diff -Nru elpa-2016.05.001/aclocal.m4 elpa-2019.11.001/aclocal.m4 --- elpa-2016.05.001/aclocal.m4 2016-05-20 07:04:35.000000000 +0000 +++ elpa-2019.11.001/aclocal.m4 2019-12-21 16:29:44.000000000 +0000 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.15 -*- Autoconf -*- +# generated automatically by aclocal 1.16.1 -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2018 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2014 Free Software Foundation, Inc. +# Copyright (C) 2002-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -32,10 +32,10 @@ # generated from the m4 files accompanying Automake X.Y. # (This private macro should not be called outside this file.) AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.15' +[am__api_version='1.16' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.15], [], +m4_if([$1], [1.16.1], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,12 +51,12 @@ # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.15])dnl +[AM_AUTOMAKE_VERSION([1.16.1])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) -# Copyright (C) 2011-2014 Free Software Foundation, Inc. +# Copyright (C) 2011-2018 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -118,7 +118,7 @@ # Figure out how to run the assembler. -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -138,7 +138,7 @@ # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -190,7 +190,7 @@ # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2014 Free Software Foundation, Inc. +# Copyright (C) 1997-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -221,7 +221,7 @@ Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -412,13 +412,12 @@ # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. - # _AM_OUTPUT_DEPENDENCY_COMMANDS # ------------------------------ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], @@ -426,49 +425,41 @@ # Older Autoconf quotes --file arguments for eval, but not when files # are listed without --file. 
Let's play safe and only enable the eval # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac + # TODO: see whether this extra hack can be removed once we start + # requiring Autoconf 2.70 or later. + AS_CASE([$CONFIG_FILES], + [*\'*], [eval set x "$CONFIG_FILES"], + [*], [set x $CONFIG_FILES]) shift - for mf + # Used to flag and report bootstrapping failures. + am_rc=0 + for am_mf do # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line + am_mf=`AS_ECHO(["$am_mf"]) | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile which includes + # dependency-tracking related rules and includes. + # Grep'ing the whole file directly is not great: AIX grep has a line # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`AS_DIRNAME("$mf")` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. 
- for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`AS_DIRNAME(["$file"])` - AS_MKDIR_P([$dirpart/$fdir]) - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done + sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ + || continue + am_dirpart=`AS_DIRNAME(["$am_mf"])` + am_filepart=`AS_BASENAME(["$am_mf"])` + AM_RUN_LOG([cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles]) || am_rc=$? done + if test $am_rc -ne 0; then + AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments + for automatic dependency tracking. Try re-running configure with the + '--disable-dependency-tracking' option to at least be able to build + the package (albeit without support for automatic dependency tracking).]) + fi + AS_UNSET([am_dirpart]) + AS_UNSET([am_filepart]) + AS_UNSET([am_mf]) + AS_UNSET([am_rc]) + rm -f conftest-deps.mk } ])# _AM_OUTPUT_DEPENDENCY_COMMANDS @@ -477,18 +468,17 @@ # ----------------------------- # This macro should only be invoked once -- use via AC_REQUIRE. # -# This code is only required when automatic dependency tracking -# is enabled. FIXME. This creates each '.P' file that we will -# need in order to bootstrap the dependency handling code. +# This code is only required when automatic dependency tracking is enabled. +# This creates each '.Po' and '.Plo' makefile fragment that we'll need in +# order to bootstrap the dependency handling code. AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], [AC_CONFIG_COMMANDS([depfiles], [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) -]) + [AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}"])]) # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. 
+# Copyright (C) 1996-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -575,8 +565,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl # For better backward compatibility. To be removed once Automake 1.9.x # dies out for good. For more background, see: -# -# +# +# AC_SUBST([mkdir_p], ['$(MKDIR_P)']) # We need awk for the "check" target (and possibly the TAP driver). The # system "awk" is bad on some platforms. @@ -643,7 +633,7 @@ Aborting the configuration process, to ensure you take notice of the issue. You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . +that behaves properly: . If you want to complete the configuration process using your problematic 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM @@ -685,7 +675,7 @@ done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -706,7 +696,7 @@ fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2014 Free Software Foundation, Inc. +# Copyright (C) 2003-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -725,45 +715,9 @@ rmdir .tst 2>/dev/null AC_SUBST([am__leading_dot])]) -# Add --enable-maintainer-mode option to configure. -*- Autoconf -*- -# From Jim Meyering - -# Copyright (C) 1996-2014 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
- -# AM_MAINTAINER_MODE([DEFAULT-MODE]) -# ---------------------------------- -# Control maintainer-specific portions of Makefiles. -# Default is to disable them, unless 'enable' is passed literally. -# For symmetry, 'disable' may be passed as well. Anyway, the user -# can override the default with the --enable/--disable switch. -AC_DEFUN([AM_MAINTAINER_MODE], -[m4_case(m4_default([$1], [disable]), - [enable], [m4_define([am_maintainer_other], [disable])], - [disable], [m4_define([am_maintainer_other], [enable])], - [m4_define([am_maintainer_other], [enable]) - m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])]) -AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) - dnl maintainer-mode's default is 'disable' unless 'enable' is passed - AC_ARG_ENABLE([maintainer-mode], - [AS_HELP_STRING([--]am_maintainer_other[-maintainer-mode], - am_maintainer_other[ make rules and dependencies not useful - (and sometimes confusing) to the casual installer])], - [USE_MAINTAINER_MODE=$enableval], - [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes])) - AC_MSG_RESULT([$USE_MAINTAINER_MODE]) - AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes]) - MAINT=$MAINTAINER_MODE_TRUE - AC_SUBST([MAINT])dnl -] -) - # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -771,49 +725,42 @@ # AM_MAKE_INCLUDE() # ----------------- -# Check to see how make treats includes. +# Check whether make has an 'include' directive that can support all +# the idioms we need for our automatic dependency tracking code. 
AC_DEFUN([AM_MAKE_INCLUDE], -[am_make=${MAKE-make} -cat > confinc << 'END' +[AC_MSG_CHECKING([whether ${MAKE-make} supports the include directive]) +cat > confinc.mk << 'END' am__doit: - @echo this is the am__doit target + @echo this is the am__doit target >confinc.out .PHONY: am__doit END -# If we don't find an include directive, just comment out the code. -AC_MSG_CHECKING([for style of include used by $am_make]) am__include="#" am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD - ;; - esac -fi -AC_SUBST([am__include]) -AC_SUBST([am__quote]) -AC_MSG_RESULT([$_am_result]) -rm -f confinc confmf -]) +# BSD make does it like this. +echo '.include "confinc.mk" # ignored' > confmf.BSD +# Other make implementations (GNU, Solaris 10, AIX) do it like this. +echo 'include confinc.mk # ignored' > confmf.GNU +_am_result=no +for s in GNU BSD; do + AM_RUN_LOG([${MAKE-make} -f confmf.$s && cat confinc.out]) + AS_CASE([$?:`cat confinc.out 2>/dev/null`], + ['0:this is the am__doit target'], + [AS_CASE([$s], + [BSD], [am__include='.include' am__quote='"'], + [am__include='include' am__quote=''])]) + if test "$am__include" != "#"; then + _am_result="yes ($s style)" + break + fi +done +rm -f confinc.* confmf.* +AC_MSG_RESULT([${_am_result}]) +AC_SUBST([am__include])]) +AC_SUBST([am__quote])]) # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2014 Free Software Foundation, Inc. +# Copyright (C) 1997-2018 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -852,7 +799,7 @@ # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -881,7 +828,7 @@ AC_DEFUN([_AM_IF_OPTION], [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -928,7 +875,245 @@ # For backward compatibility. AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + + +# AM_PATH_PYTHON([MINIMUM-VERSION], [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +# --------------------------------------------------------------------------- +# Adds support for distributing Python modules and packages. To +# install modules, copy them to $(pythondir), using the python_PYTHON +# automake variable. To install a package with the same name as the +# automake package, install to $(pkgpythondir), or use the +# pkgpython_PYTHON automake variable. +# +# The variables $(pyexecdir) and $(pkgpyexecdir) are provided as +# locations to install python extension modules (shared libraries). +# Another macro is required to find the appropriate flags to compile +# extension modules. 
+# +# If your package is configured with a different prefix to python, +# users will have to add the install directory to the PYTHONPATH +# environment variable, or create a .pth file (see the python +# documentation for details). +# +# If the MINIMUM-VERSION argument is passed, AM_PATH_PYTHON will +# cause an error if the version of python installed on the system +# doesn't meet the requirement. MINIMUM-VERSION should consist of +# numbers and dots only. +AC_DEFUN([AM_PATH_PYTHON], + [ + dnl Find a Python interpreter. Python versions prior to 2.0 are not + dnl supported. (2.0 was released on October 16, 2000). + m4_define_default([_AM_PYTHON_INTERPRETER_LIST], +[python python2 python3 dnl + python3.9 python3.8 python3.7 python3.6 python3.5 python3.4 python3.3 dnl + python3.2 python3.1 python3.0 dnl + python2.7 python2.6 python2.5 python2.4 python2.3 python2.2 python2.1 dnl + python2.0]) + + AC_ARG_VAR([PYTHON], [the Python interpreter]) + + m4_if([$1],[],[ + dnl No version check is needed. + # Find any Python interpreter. + if test -z "$PYTHON"; then + AC_PATH_PROGS([PYTHON], _AM_PYTHON_INTERPRETER_LIST, :) + fi + am_display_PYTHON=python + ], [ + dnl A version check is needed. + if test -n "$PYTHON"; then + # If the user set $PYTHON, use it and don't search something else. + AC_MSG_CHECKING([whether $PYTHON version is >= $1]) + AM_PYTHON_CHECK_VERSION([$PYTHON], [$1], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + AC_MSG_ERROR([Python interpreter is too old])]) + am_display_PYTHON=$PYTHON + else + # Otherwise, try each interpreter until we find one that satisfies + # VERSION. + AC_CACHE_CHECK([for a Python interpreter with version >= $1], + [am_cv_pathless_PYTHON],[ + for am_cv_pathless_PYTHON in _AM_PYTHON_INTERPRETER_LIST none; do + test "$am_cv_pathless_PYTHON" = none && break + AM_PYTHON_CHECK_VERSION([$am_cv_pathless_PYTHON], [$1], [break]) + done]) + # Set $PYTHON to the absolute path of $am_cv_pathless_PYTHON. 
+ if test "$am_cv_pathless_PYTHON" = none; then + PYTHON=: + else + AC_PATH_PROG([PYTHON], [$am_cv_pathless_PYTHON]) + fi + am_display_PYTHON=$am_cv_pathless_PYTHON + fi + ]) + + if test "$PYTHON" = :; then + dnl Run any user-specified action, or abort. + m4_default([$3], [AC_MSG_ERROR([no suitable Python interpreter found])]) + else + + dnl Query Python for its version number. Getting [:3] seems to be + dnl the best way to do this; it's what "site.py" does in the standard + dnl library. + + AC_CACHE_CHECK([for $am_display_PYTHON version], [am_cv_python_version], + [am_cv_python_version=`$PYTHON -c "import sys; sys.stdout.write(sys.version[[:3]])"`]) + AC_SUBST([PYTHON_VERSION], [$am_cv_python_version]) + + dnl Use the values of $prefix and $exec_prefix for the corresponding + dnl values of PYTHON_PREFIX and PYTHON_EXEC_PREFIX. These are made + dnl distinct variables so they can be overridden if need be. However, + dnl general consensus is that you shouldn't need this ability. + + AC_SUBST([PYTHON_PREFIX], ['${prefix}']) + AC_SUBST([PYTHON_EXEC_PREFIX], ['${exec_prefix}']) + + dnl At times (like when building shared libraries) you may want + dnl to know which OS platform Python thinks this is. + + AC_CACHE_CHECK([for $am_display_PYTHON platform], [am_cv_python_platform], + [am_cv_python_platform=`$PYTHON -c "import sys; sys.stdout.write(sys.platform)"`]) + AC_SUBST([PYTHON_PLATFORM], [$am_cv_python_platform]) + + # Just factor out some code duplication. + am_python_setup_sysconfig="\ +import sys +# Prefer sysconfig over distutils.sysconfig, for better compatibility +# with python 3.x. See automake bug#10227. 
+try: + import sysconfig +except ImportError: + can_use_sysconfig = 0 +else: + can_use_sysconfig = 1 +# Can't use sysconfig in CPython 2.7, since it's broken in virtualenvs: +# +try: + from platform import python_implementation + if python_implementation() == 'CPython' and sys.version[[:3]] == '2.7': + can_use_sysconfig = 0 +except ImportError: + pass" + + dnl Set up 4 directories: + + dnl pythondir -- where to install python scripts. This is the + dnl site-packages directory, not the python standard library + dnl directory like in previous automake betas. This behavior + dnl is more consistent with lispdir.m4 for example. + dnl Query distutils for this directory. + AC_CACHE_CHECK([for $am_display_PYTHON script directory], + [am_cv_python_pythondir], + [if test "x$prefix" = xNONE + then + am_py_prefix=$ac_default_prefix + else + am_py_prefix=$prefix + fi + am_cv_python_pythondir=`$PYTHON -c " +$am_python_setup_sysconfig +if can_use_sysconfig: + sitedir = sysconfig.get_path('purelib', vars={'base':'$am_py_prefix'}) +else: + from distutils import sysconfig + sitedir = sysconfig.get_python_lib(0, 0, prefix='$am_py_prefix') +sys.stdout.write(sitedir)"` + case $am_cv_python_pythondir in + $am_py_prefix*) + am__strip_prefix=`echo "$am_py_prefix" | sed 's|.|.|g'` + am_cv_python_pythondir=`echo "$am_cv_python_pythondir" | sed "s,^$am__strip_prefix,$PYTHON_PREFIX,"` + ;; + *) + case $am_py_prefix in + /usr|/System*) ;; + *) + am_cv_python_pythondir=$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages + ;; + esac + ;; + esac + ]) + AC_SUBST([pythondir], [$am_cv_python_pythondir]) + + dnl pkgpythondir -- $PACKAGE directory under pythondir. Was + dnl PYTHON_SITE_PACKAGE in previous betas, but this naming is + dnl more consistent with the rest of automake. + + AC_SUBST([pkgpythondir], [\${pythondir}/$PACKAGE]) + + dnl pyexecdir -- directory for installing python extension modules + dnl (shared libraries) + dnl Query distutils for this directory. 
+ AC_CACHE_CHECK([for $am_display_PYTHON extension module directory], + [am_cv_python_pyexecdir], + [if test "x$exec_prefix" = xNONE + then + am_py_exec_prefix=$am_py_prefix + else + am_py_exec_prefix=$exec_prefix + fi + am_cv_python_pyexecdir=`$PYTHON -c " +$am_python_setup_sysconfig +if can_use_sysconfig: + sitedir = sysconfig.get_path('platlib', vars={'platbase':'$am_py_prefix'}) +else: + from distutils import sysconfig + sitedir = sysconfig.get_python_lib(1, 0, prefix='$am_py_prefix') +sys.stdout.write(sitedir)"` + case $am_cv_python_pyexecdir in + $am_py_exec_prefix*) + am__strip_prefix=`echo "$am_py_exec_prefix" | sed 's|.|.|g'` + am_cv_python_pyexecdir=`echo "$am_cv_python_pyexecdir" | sed "s,^$am__strip_prefix,$PYTHON_EXEC_PREFIX,"` + ;; + *) + case $am_py_exec_prefix in + /usr|/System*) ;; + *) + am_cv_python_pyexecdir=$PYTHON_EXEC_PREFIX/lib/python$PYTHON_VERSION/site-packages + ;; + esac + ;; + esac + ]) + AC_SUBST([pyexecdir], [$am_cv_python_pyexecdir]) + + dnl pkgpyexecdir -- $(pyexecdir)/$(PACKAGE) + + AC_SUBST([pkgpyexecdir], [\${pyexecdir}/$PACKAGE]) + + dnl Run any user-specified action. + $2 + fi + +]) + + +# AM_PYTHON_CHECK_VERSION(PROG, VERSION, [ACTION-IF-TRUE], [ACTION-IF-FALSE]) +# --------------------------------------------------------------------------- +# Run ACTION-IF-TRUE if the Python interpreter PROG has version >= VERSION. +# Run ACTION-IF-FALSE otherwise. +# This test uses sys.hexversion instead of the string equivalent (first +# word of sys.version), in order to cope with versions such as 2.2c1. +# This supports Python 2.0 or higher. (2.0 was released on October 16, 2000). +AC_DEFUN([AM_PYTHON_CHECK_VERSION], + [prog="import sys +# split strings by '.' and convert to numeric. Append some zeros +# because we need at least 4 digits for the hex conversion. 
+# map returns an iterator in Python 3.0 and a list in 2.x +minver = list(map(int, '$2'.split('.'))) + [[0, 0, 0]] +minverhex = 0 +# xrange is not present in Python 3.0 and range returns an iterator +for i in list(range(0, 4)): minverhex = (minverhex << 8) + minver[[i]] +sys.exit(sys.hexversion < minverhex)" + AS_IF([AM_RUN_LOG([$1 -c "$prog"])], [$3], [$4])]) + +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -947,7 +1132,7 @@ # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1028,7 +1213,7 @@ rm -f conftest.file ]) -# Copyright (C) 2009-2014 Free Software Foundation, Inc. +# Copyright (C) 2009-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1088,7 +1273,7 @@ _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2014 Free Software Foundation, Inc. +# Copyright (C) 2001-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1116,7 +1301,7 @@ INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2014 Free Software Foundation, Inc. +# Copyright (C) 2006-2018 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1135,7 +1320,7 @@ # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2014 Free Software Foundation, Inc. +# Copyright (C) 2004-2018 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1267,6 +1452,9 @@ ]) # _AM_PROG_TAR m4_include([m4/ax_check_gnu_make.m4]) +m4_include([m4/ax_ext.m4]) +m4_include([m4/ax_gcc_x86_avx_xgetbv.m4]) +m4_include([m4/ax_gcc_x86_cpuid.m4]) m4_include([m4/ax_prog_cc_mpi.m4]) m4_include([m4/ax_prog_doxygen.m4]) m4_include([m4/libtool.m4]) @@ -1274,3 +1462,4 @@ m4_include([m4/ltsugar.m4]) m4_include([m4/ltversion.m4]) m4_include([m4/lt~obsolete.m4]) +m4_include([m4/m4_ax_check_compile_flag.m4]) diff -Nru elpa-2016.05.001/ar-lib elpa-2019.11.001/ar-lib --- elpa-2016.05.001/ar-lib 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/ar-lib 2019-12-21 16:29:46.000000000 +0000 @@ -4,7 +4,7 @@ me=ar-lib scriptversion=2012-03-01.08; # UTC -# Copyright (C) 2010-2014 Free Software Foundation, Inc. +# Copyright (C) 2010-2018 Free Software Foundation, Inc. # Written by Peter Rosin . # # This program is free software; you can redistribute it and/or modify @@ -18,7 +18,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a diff -Nru elpa-2016.05.001/Changelog elpa-2019.11.001/Changelog --- elpa-2016.05.001/Changelog 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/Changelog 2019-12-20 05:57:47.000000000 +0000 @@ -1,3 +1,164 @@ +Changelog for upcoming release + +- not yet decided + +Changelog for ELPA 2019.11.001 + +- solve a bug when using parallel make builds +- check the cpuid set during build time +- add experimental feature "heterogenous-cluster-support" +- add experimental feature for 64bit integer LAS/LAPACK/SCALAPACK support +- add experimental feature for 64bit integer MPI support +- support of ELPA for real valued skew-symmetric matrices, please cite: + https://arxiv.org/abs/1912.04062 +- cleanup of the GPU version +- bugfix in the OpenMP version +- bugfix on the Power8/9 kernels +- bugfix on ARM aarch64 FMA kernels + + +Changelog for ELPA 2019.05.002 + +- repacking of the src since the legacy interface has been forgotten in the + 2019.05.001 release + +Changelog for ELPA 2019.05.001 + +- elpa_print_kernels supports GPU usage +- fix an error if PAPI measurements are activated +- new simple real kernels: block4 and block6 +- c functions can be build with optional arguments if compiler supports it +(configure option) +- allow measurements with the likwid tool +- users can define the default-kernel at build time +- ELPA versioning number is provided in the C header files +- as announced a year ago, the following deprecated routines have been finally +removed; see DEPRECATED_FEATURES for the replacement routines , which have +been introduced a year ago. 
Removed routines: + -> mult_at_b_real + -> mult_ah_b_complex + -> invert_trm_real + -> invert_trm_complex + -> cholesky_real + -> cholesky_complex + -> solve_tridi +- new kernels for ARM arch64 added +- fix an out-of-bound-error in elpa2 + + +Changelog for ELPA 2018.11.001 + +- improved autotuning +- improved performance of generalized problem via Cannon's algorithm +- check pointing functionality of elpa objects +- store/read/resume of autotuning +- Python interface for ELPA +- more ELPA functions have an optional error argument (Fortran) or required +error argument (C) => ABI and API change + + +Changelog for ELPA 2018.05.001 + +- significant improved performance on K-computer +- added interface for the generalized eigenvalue problem +- extended autotuning functionality + +Changelog for ELPA 2017.11.001 + +- significant improvement of performance of GPU version +- added new compute kernels for IBM Power8 and Fujistu Sparc64 + processors +- a first implementation of autotuning capability +- correct some type statements in Fortran +- correct detection of PAPI in configure step + +Changelog for ELPA 2017.05.003 + +- remove bug in invert_triangular, which had been introduced + in ELPA 2017.05.002 + +Changelog for ELPA 2017.05.002 + +Mainly bugfixes for ELPA 2017.05.001: +- fix memory leak of MPI communicators +- tests for hermitian_multiply, cholesky decomposition and +- deal with a problem on Debian (mawk) + +Changelog for ELPA 2017.05.001 + +Final release of ELPA 2017.05.001 +Since rc2 the following changes have been made +- more extensive tests during "make check" +- distribute missing C headers +- introduce analytic tests +- Fix stack overflow in some kernels + +Changelog for ELPA 2017.05.001.rc2 + +This is the release candidate 2 for the ELPA 2017.05.001 version. 
+Additionaly to the changes from rc1, it fixes some smaller issues +- add missing script "manual_cpp" +- cleanup of code + +Changelog for ELPA 2017.05.001.rc1 + +This is the release candidate 1 for the ELPA 2017.05.001 version. +It provides a first version of the new, more generic API of the ELPA library. +Smaller changes to the API might be possible in the upcoming release +candidates. For users, who would like to use the older API of the ELPA +library, the API as defined with release 2016.11.001.pre is frozen in and +also supported. + +Apart of the API change to be more flexible for the future, this release +offers the following changes: + +- faster GPU implementation, especially for ELPA 1stage +- the restriction of the block-cyclic distribution blocksize = 128 in the GPU + case is relaxed +- Faster CPU implementation due to better blocking +- support of already banded matrices (new API only!) +- improved KNL support + +Changelog for pre-release ELPA 2016.11.001.pre + +This pre-release contains an experimental API which will most likely +change in the next stable release + +- also suport of single-precision (real and complex case) eigenvalule problems +- GPU support in ELPA 1stage and 2stage (real and complex case) +- change of API (w.r.t. 
ELPA 2016.05.004) to support runtime-choice of GPU usage + +Changelog for release ELPA 2016.05.004 + +- fix a problem with the private state of module precision +- distribute test_project with dist tarball +- generic driver routine for ELPA 1stage and 2stage +- test case for elpa_mult_at_b_real +- test case for elpa_mult_ah_b_complex +- test case for elpa_cholesky_real +- test case for elpa_cholesky_complex +- test case for elpa_invert_trm_real +- test case for elpa_invert_trm_complex +- fix building of static library +- better choice of AVX, AVX2, AVX512 kernels +- make assumed size Fortran arrays default + +Changelog for release ELPA 2016.05.003 + +- fix a problem with the build of SSE kernels +- make some (internal) functions public, such that they + can be used outside of ELPA +- add documentation and interfaces for new public functions +- shorten file namses and directory names for test programs + in under to by pass "make agrument list too long" error + +Changelog for release ELPA 2016.05.002 + +- fix problem with generated *.sh- check scripts +- name library differently if build without MPI support +- install only public modules + + Changelog for release ELPA 2016.05.001 - support building without MPI for one node usage diff -Nru elpa-2016.05.001/compile elpa-2019.11.001/compile --- elpa-2016.05.001/compile 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/compile 2019-12-21 16:29:46.000000000 +0000 @@ -1,9 +1,9 @@ #! /bin/sh # Wrapper for compilers which do not understand '-c -o'. -scriptversion=2012-10-14.11; # UTC +scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. # Written by Tom Tromey . # # This program is free software; you can redistribute it and/or modify @@ -17,7 +17,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
+# along with this program. If not, see . # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -255,7 +255,8 @@ echo "compile $scriptversion" exit $? ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) + cl | *[/\\]cl | cl.exe | *[/\\]cl.exe | \ + icl | *[/\\]icl | icl.exe | *[/\\]icl.exe ) func_cl_wrapper "$@" # Doesn't return... ;; esac @@ -339,9 +340,9 @@ # Local Variables: # mode: shell-script # sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru elpa-2016.05.001/config.guess elpa-2019.11.001/config.guess --- elpa-2016.05.001/config.guess 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/config.guess 2019-12-21 16:29:46.000000000 +0000 @@ -1,8 +1,8 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2014 Free Software Foundation, Inc. +# Copyright 1992-2018 Free Software Foundation, Inc. -timestamp='2014-11-04' +timestamp='2018-03-08' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -15,7 +15,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, see . +# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -27,7 +27,7 @@ # Originally written by Per Bothner; maintained since 2000 by Ben Elliston. 
# # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess # # Please send patches to . @@ -39,7 +39,7 @@ Output the configuration name of the system \`$me' is run on. -Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -50,7 +50,7 @@ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2014 Free Software Foundation, Inc. +Copyright 1992-2018 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -107,9 +107,9 @@ dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; + ,,) echo "int x;" > "$dummy.c" ; for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + if ($c -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; @@ -132,14 +132,14 @@ UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown -case "${UNAME_SYSTEM}" in +case "$UNAME_SYSTEM" in Linux|GNU|GNU/*) # If the system lacks a compiler, then just pick glibc. # We could probably try harder. LIBC=gnu - eval $set_cc_for_build - cat <<-EOF > $dummy.c + eval "$set_cc_for_build" + cat <<-EOF > "$dummy.c" #include #if defined(__UCLIBC__) LIBC=uclibc @@ -149,7 +149,14 @@ LIBC=gnu #endif EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` + eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`" + + # If ldd exists, use it to detect musl libc. 
+ if command -v ldd >/dev/null && \ + ldd --version 2>&1 | grep -q ^musl + then + LIBC=musl + fi ;; esac @@ -165,7 +172,7 @@ # Note: order is significant - the case branches are not exclusive. -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in +case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, @@ -178,21 +185,31 @@ # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in + UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ + "/sbin/$sysctl" 2>/dev/null || \ + "/usr/sbin/$sysctl" 2>/dev/null || \ + echo unknown)` + case "$UNAME_MACHINE_ARCH" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + earmv*) + arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'` + endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'` + machine="${arch}${endian}"-${VENDOR}-unknown + ;; + *) machine="$UNAME_MACHINE_ARCH"-${VENDOR}-unknown ;; esac # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in + # to ELF recently (or will in the future) and ABI. + case "$UNAME_MACHINE_ARCH" in + earm*) + os=netbsdelf + ;; arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build + eval "$set_cc_for_build" if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then @@ -207,44 +224,67 @@ os=netbsd ;; esac + # Determine ABI tags. 
+ case "$UNAME_MACHINE_ARCH" in + earm*) + expr='s/^earmv[0-9]/-eabi/;s/eb$//' + abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"` + ;; + esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in + case "$UNAME_VERSION" in Debian*) release='-gnu' ;; *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. - echo "${machine}-${os}${release}" + echo "$machine-${os}${release}${abi}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` - echo ${UNAME_MACHINE_ARCH}-${VENDOR}-bitrig${UNAME_RELEASE} + echo "$UNAME_MACHINE_ARCH"-${VENDOR}-bitrig"$UNAME_RELEASE" exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-${VENDOR}-openbsd${UNAME_RELEASE} + echo "$UNAME_MACHINE_ARCH"-${VENDOR}-openbsd"$UNAME_RELEASE" + exit ;; + *:LibertyBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` + echo "$UNAME_MACHINE_ARCH"-${VENDOR}-libertybsd"$UNAME_RELEASE" + exit ;; + *:MidnightBSD:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-midnightbsd"$UNAME_RELEASE" exit ;; *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-ekkobsd${UNAME_RELEASE} + echo "$UNAME_MACHINE"-${VENDOR}-ekkobsd"$UNAME_RELEASE" exit ;; *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-solidbsd${UNAME_RELEASE} + echo "$UNAME_MACHINE"-${VENDOR}-solidbsd"$UNAME_RELEASE" exit ;; macppc:MirBSD:*:*) - echo powerpc-${VENDOR}-mirbsd${UNAME_RELEASE} + echo powerpc-${VENDOR}-mirbsd"$UNAME_RELEASE" exit ;; *:MirBSD:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-mirbsd${UNAME_RELEASE} + echo 
"$UNAME_MACHINE"-${VENDOR}-mirbsd"$UNAME_RELEASE" exit ;; + *:Sortix:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-sortix + exit ;; + *:Redox:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-redox + exit ;; + mips:OSF1:*.*) + echo mips-dec-osf1 + exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) @@ -261,63 +301,54 @@ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; + UNAME_MACHINE=alpha ;; "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; + UNAME_MACHINE=alphaev5 ;; "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; + UNAME_MACHINE=alphaev56 ;; "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; + UNAME_MACHINE=alphapca56 ;; "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; + UNAME_MACHINE=alphapca57 ;; "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; + UNAME_MACHINE=alphaev6 ;; "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; + UNAME_MACHINE=alphaev67 ;; "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; + UNAME_MACHINE=alphaev68 ;; "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; + UNAME_MACHINE=alphaev69 ;; "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; + UNAME_MACHINE=alphaev7 ;; "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; + UNAME_MACHINE=alphaev79 ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. 
- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + echo "$UNAME_MACHINE"-dec-osf"`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`" # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-${VENDOR}-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-amigaos + echo "$UNAME_MACHINE"-${VENDOR}-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-morphos + echo "$UNAME_MACHINE"-${VENDOR}-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition @@ -329,7 +360,7 @@ echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} + echo arm-acorn-riscix"$UNAME_RELEASE" exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-${VENDOR}-riscos @@ -356,38 +387,38 @@ sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) - echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo "$UNAME_MACHINE"-ibm-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" exit ;; sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-hal-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-sun-solaris2"`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`" exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) - echo i386-pc-auroraux${UNAME_RELEASE} + echo i386-pc-auroraux"$UNAME_RELEASE" 
exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - eval $set_cc_for_build - SUN_ARCH="i386" + eval "$set_cc_for_build" + SUN_ARCH=i386 # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then - SUN_ARCH="x86_64" + SUN_ARCH=x86_64 fi fi - echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo "$SUN_ARCH"-pc-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo sparc-sun-solaris3"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in @@ -396,25 +427,25 @@ ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. 
- echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + echo sparc-sun-sunos"`echo "$UNAME_RELEASE"|sed -e 's/-/_/'`" exit ;; sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} + echo m68k-sun-sunos"$UNAME_RELEASE" exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) - echo m68k-sun-sunos${UNAME_RELEASE} + echo m68k-sun-sunos"$UNAME_RELEASE" ;; sun4) - echo sparc-sun-sunos${UNAME_RELEASE} + echo sparc-sun-sunos"$UNAME_RELEASE" ;; esac exit ;; aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} + echo sparc-auspex-sunos"$UNAME_RELEASE" exit ;; # The situation for MiNT is a little confusing. The machine name # can be virtually everything (everything which is not @@ -425,44 +456,44 @@ # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint"$UNAME_RELEASE" exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} + echo m68k-milan-mint"$UNAME_RELEASE" exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} + echo m68k-hades-mint"$UNAME_RELEASE" exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-${VENDOR}-mint${UNAME_RELEASE} + echo m68k-${VENDOR}-mint"$UNAME_RELEASE" exit ;; m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} + echo m68k-apple-machten"$UNAME_RELEASE" exit ;; powerpc:machten:*:*) - echo 
powerpc-apple-machten${UNAME_RELEASE} + echo powerpc-apple-machten"$UNAME_RELEASE" exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} + echo mips-dec-ultrix"$UNAME_RELEASE" exit ;; VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} + echo vax-dec-ultrix"$UNAME_RELEASE" exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} + echo clipper-intergraph-clix"$UNAME_RELEASE" exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { @@ -471,23 +502,23 @@ #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - SYSTEM_NAME=`$dummy $dummyarg` && + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && + dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`"$dummy" "$dummyarg"` && { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} + echo mips-mips-riscos"$UNAME_RELEASE" exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax @@ -513,17 +544,17 @@ AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + if [ "$UNAME_PROCESSOR" = mc88100 ] 
|| [ "$UNAME_PROCESSOR" = mc88110 ] then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] + if [ "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx ] || \ + [ "$TARGET_BINARY_INTERFACE"x = x ] then - echo m88k-dg-dgux${UNAME_RELEASE} + echo m88k-dg-dgux"$UNAME_RELEASE" else - echo m88k-dg-dguxbcs${UNAME_RELEASE} + echo m88k-dg-dguxbcs"$UNAME_RELEASE" fi else - echo i586-dg-dgux${UNAME_RELEASE} + echo i586-dg-dgux"$UNAME_RELEASE" fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) @@ -540,7 +571,7 @@ echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + echo mips-sgi-irix"`echo "$UNAME_RELEASE"|sed -e 's/-/_/g'`" exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id @@ -552,14 +583,14 @@ if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + echo "$UNAME_MACHINE"-ibm-aix"$IBM_REV" exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #include main() @@ -570,7 +601,7 @@ exit(0); } EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` then echo "$SYSTEM_NAME" else @@ -584,7 +615,7 @@ exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc @@ -593,18 +624,18 @@ IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + 
IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} + echo "$IBM_ARCH"-ibm-aix"$IBM_REV" exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) + ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + echo romp-ibm-bsd"$UNAME_RELEASE" # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx @@ -619,28 +650,28 @@ echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? ) HP_ARCH=m68k ;; + HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` + case "$UNAME_MACHINE" in + 9000/31?) HP_ARCH=m68000 ;; + 9000/[34]??) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + case "$sc_cpu_version" in + 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 + 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + case "$sc_kernel_bits" in + 32) HP_ARCH=hppa2.0n ;; + 64) HP_ARCH=hppa2.0w ;; + '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 esac ;; esac fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + if [ "$HP_ARCH" = "" ]; then + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #define _HPUX_SOURCE #include @@ -673,13 +704,13 @@ exit (0); } EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"` test -z 
"$HP_ARCH" && HP_ARCH=hppa fi ;; esac - if [ ${HP_ARCH} = "hppa2.0w" ] + if [ "$HP_ARCH" = hppa2.0w ] then - eval $set_cc_for_build + eval "$set_cc_for_build" # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler @@ -690,23 +721,23 @@ # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then - HP_ARCH="hppa2.0w" + HP_ARCH=hppa2.0w else - HP_ARCH="hppa64" + HP_ARCH=hppa64 fi fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} + echo "$HP_ARCH"-hp-hpux"$HPUX_REV" exit ;; ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} + HPUX_REV=`echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux"$HPUX_REV" exit ;; 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #include int main () @@ -731,11 +762,11 @@ exit (0); } EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) @@ -744,7 +775,7 @@ *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) @@ -752,9 +783,9 @@ exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-${VENDOR}-osf1mk + echo "$UNAME_MACHINE"-${VENDOR}-osf1mk else - echo ${UNAME_MACHINE}-${VENDOR}-osf1 + echo "$UNAME_MACHINE"-${VENDOR}-osf1 fi exit ;; parisc*:Lites*:*:*) @@ -779,127 +810,109 @@ echo c4-convex-bsd exit ;; 
CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo ymp-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) - echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo t90-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo alphaev5-cray-unicosmk"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo sv1-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + echo craynv-cray-unicosmp"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | 
sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + echo "$UNAME_MACHINE"-pc-bsdi"$UNAME_RELEASE" exit ;; sparc*:BSD/OS:*:*) - echo sparc-${VENDOR}-bsdi${UNAME_RELEASE} + echo sparc-${VENDOR}-bsdi"$UNAME_RELEASE" exit ;; *:BSD/OS:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-bsdi${UNAME_RELEASE} + echo "$UNAME_MACHINE"-${VENDOR}-bsdi"$UNAME_RELEASE" exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` - case ${UNAME_PROCESSOR} in + case "$UNAME_PROCESSOR" in amd64) - echo x86_64-${VENDOR}-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_PROCESSOR}-${VENDOR}-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; esac + echo "$UNAME_PROCESSOR"-${VENDOR}-freebsd"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" exit ;; i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin + echo "$UNAME_MACHINE"-pc-cygwin exit ;; *:MINGW64*:*) - echo ${UNAME_MACHINE}-pc-mingw64 + echo "$UNAME_MACHINE"-pc-mingw64 exit ;; *:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 + echo "$UNAME_MACHINE"-pc-mingw32 exit ;; *:MSYS*:*) - echo ${UNAME_MACHINE}-pc-msys - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. 
- echo ${UNAME_MACHINE}-mingw32 + echo "$UNAME_MACHINE"-pc-msys exit ;; i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 + echo "$UNAME_MACHINE"-pc-pw32 exit ;; *:Interix*:*) - case ${UNAME_MACHINE} in + case "$UNAME_MACHINE" in x86) - echo i586-pc-interix${UNAME_RELEASE} + echo i586-pc-interix"$UNAME_RELEASE" exit ;; authenticamd | genuineintel | EM64T) - echo x86_64-${VENDOR}-interix${UNAME_RELEASE} + echo x86_64-${VENDOR}-interix"$UNAME_RELEASE" exit ;; IA64) - echo ia64-${VENDOR}-interix${UNAME_RELEASE} + echo ia64-${VENDOR}-interix"$UNAME_RELEASE" exit ;; esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - 8664:Windows_NT:*) - echo x86_64-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? - echo i586-pc-interix - exit ;; i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin + echo "$UNAME_MACHINE"-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-${VENDOR}-cygwin exit ;; - p*:CYGWIN*:*) - echo powerpcle-${VENDOR}-cygwin - exit ;; prep*:SunOS:5.*:*) - echo powerpcle-${VENDOR}-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + echo powerpcle-${VENDOR}-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-${VENDOR}-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo "`echo "$UNAME_MACHINE"|sed -e 's,[-/].*$,,'`-${VENDOR}-$LIBC`echo "$UNAME_RELEASE"|sed -e 's,/.*$,,'`" exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-${VENDOR}-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} + echo "$UNAME_MACHINE-${VENDOR}-`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo 
"$UNAME_RELEASE"|sed -e 's/[-(].*//'`-$LIBC" exit ;; i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix + echo "$UNAME_MACHINE"-pc-minix exit ;; aarch64:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in @@ -912,58 +925,64 @@ EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC="gnulibc1" ; fi - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + if test "$?" = 0 ; then LIBC=gnulibc1 ; fi + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; arc:Linux:*:* | arceb:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; arm*:Linux:*:*) - eval $set_cc_for_build + eval "$set_cc_for_build" if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC}eabi + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC"eabi else - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC}eabihf + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC"eabihf fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; cris:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} + echo "$UNAME_MACHINE"-axis-linux-"$LIBC" exit ;; crisv32:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-${LIBC} + echo "$UNAME_MACHINE"-axis-linux-"$LIBC" + exit ;; + e2k:Linux:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; frv:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} 
+ echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; hexagon:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; i*86:Linux:*:*) - echo ${UNAME_MACHINE}-pc-linux-${LIBC} + echo "$UNAME_MACHINE"-pc-linux-"$LIBC" exit ;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" + exit ;; + k1om:Linux:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; mips:Linux:*:* | mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + eval "$set_cc_for_build" + sed 's/^ //' << EOF > "$dummy.c" #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el @@ -977,64 +996,70 @@ #endif #endif EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-${VENDOR}-linux-${LIBC}"; exit; } + eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU'`" + test "x$CPU" != x && { echo "$CPU-${VENDOR}-linux-$LIBC"; exit; } ;; + mips64el:Linux:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" + exit ;; openrisc*:Linux:*:*) - echo or1k-${VENDOR}-linux-${LIBC} + echo or1k-${VENDOR}-linux-"$LIBC" exit ;; or32:Linux:*:* | or1k*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; padre:Linux:*:*) - echo sparc-${VENDOR}-linux-${LIBC} + echo sparc-${VENDOR}-linux-"$LIBC" exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-${VENDOR}-linux-${LIBC} + echo hppa64-${VENDOR}-linux-"$LIBC" exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-${VENDOR}-linux-${LIBC} ;; - PA8*) echo 
hppa2.0-${VENDOR}-linux-${LIBC} ;; - *) echo hppa-${VENDOR}-linux-${LIBC} ;; + PA7*) echo hppa1.1-${VENDOR}-linux-"$LIBC" ;; + PA8*) echo hppa2.0-${VENDOR}-linux-"$LIBC" ;; + *) echo hppa-${VENDOR}-linux-"$LIBC" ;; esac exit ;; ppc64:Linux:*:*) - echo powerpc64-${VENDOR}-linux-${LIBC} + echo powerpc64-${VENDOR}-linux-"$LIBC" exit ;; ppc:Linux:*:*) - echo powerpc-${VENDOR}-linux-${LIBC} + echo powerpc-${VENDOR}-linux-"$LIBC" exit ;; ppc64le:Linux:*:*) - echo powerpc64le-${VENDOR}-linux-${LIBC} + echo powerpc64le-${VENDOR}-linux-"$LIBC" exit ;; ppcle:Linux:*:*) - echo powerpcle-${VENDOR}-linux-${LIBC} + echo powerpcle-${VENDOR}-linux-"$LIBC" + exit ;; + riscv32:Linux:*:* | riscv64:Linux:*:*) + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux-${LIBC} + echo "$UNAME_MACHINE"-ibm-linux-"$LIBC" exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; sh*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; tile*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-${LIBC} + echo "$UNAME_MACHINE"-dec-linux-"$LIBC" exit ;; x86_64:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-pc-linux-"$LIBC" exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} + echo "$UNAME_MACHINE"-${VENDOR}-linux-"$LIBC" exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. @@ -1048,34 +1073,34 @@ # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. 
- echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + echo "$UNAME_MACHINE"-pc-sysv4.2uw"$UNAME_VERSION" exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. - echo ${UNAME_MACHINE}-pc-os2-emx + echo "$UNAME_MACHINE"-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-${VENDOR}-stop + echo "$UNAME_MACHINE"-${VENDOR}-stop exit ;; i*86:atheos:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-atheos + echo "$UNAME_MACHINE"-${VENDOR}-atheos exit ;; i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable + echo "$UNAME_MACHINE"-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) - echo i386-${VENODR}-lynxos${UNAME_RELEASE} + echo i386-${VENDOR}-lynxos"$UNAME_RELEASE" exit ;; i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp + echo "$UNAME_MACHINE"-pc-msdosdjgpp exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + i*86:*:4.*:*) + UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + echo "$UNAME_MACHINE"-univel-sysv"$UNAME_REL" else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + echo "$UNAME_MACHINE"-pc-sysv"$UNAME_REL" fi exit ;; i*86:*:5:[678]*) @@ -1085,12 +1110,12 @@ *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac - echo ${UNAME_MACHINE}-${VENDOR}-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + echo "$UNAME_MACHINE-${VENDOR}-sysv${UNAME_RELEASE}${UNAME_SYSTEM}{$UNAME_VERSION}" exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 @@ -1100,9 +1125,9 @@ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + 
echo "$UNAME_MACHINE"-pc-sco"$UNAME_REL" else - echo ${UNAME_MACHINE}-pc-sysv32 + echo "$UNAME_MACHINE"-pc-sysv32 fi exit ;; pc:*:*:*) @@ -1110,7 +1135,7 @@ # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub - # prints for the "djgpp" host, or else GDB configury will decide that + # prints for the "djgpp" host, or else GDB configure will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; @@ -1122,9 +1147,9 @@ exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + echo i860-stardent-sysv"$UNAME_RELEASE" # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered. - echo i860-${VENODR}-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + echo i860-${VENDOR}-sysv"$UNAME_RELEASE" # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) @@ -1144,9 +1169,9 @@ test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; @@ -1155,28 +1180,28 @@ test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } /bin/uname -p 
2>/dev/null | /bin/grep pteron >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-${VENDOR}-lynxos${UNAME_RELEASE} + echo m68k-${VENDOR}-lynxos"$UNAME_RELEASE" exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) - echo sparc-${VENDOR}-lynxos${UNAME_RELEASE} + echo sparc-${VENDOR}-lynxos"$UNAME_RELEASE" exit ;; rs6000:LynxOS:2.*:*) - echo rs6000-${VENDOR}-lynxos${UNAME_RELEASE} + echo rs6000-${VENDOR}-lynxos"$UNAME_RELEASE" exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) - echo powerpc-${VENDOR}-lynxos${UNAME_RELEASE} + echo powerpc-${VENDOR}-lynxos"$UNAME_RELEASE" exit ;; SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} + echo mips-dde-sysv"$UNAME_RELEASE" exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 @@ -1187,7 +1212,7 @@ *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 + echo "$UNAME_MACHINE"-sni-sysv4 else echo ns32k-sni-sysv fi @@ -1207,23 +1232,23 @@ exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos + echo "$UNAME_MACHINE"-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} + echo m68k-apple-aux"$UNAME_RELEASE" exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} + echo mips-nec-sysv"$UNAME_RELEASE" else - echo mips-${VENDOR}-sysv${UNAME_RELEASE} + echo mips-${VENDOR}-sysv"$UNAME_RELEASE" fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. 
@@ -1242,46 +1267,56 @@ echo x86_64-${VENDOR}-haiku exit ;; SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} + echo sx4-nec-superux"$UNAME_RELEASE" exit ;; SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} + echo sx5-nec-superux"$UNAME_RELEASE" exit ;; SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} + echo sx6-nec-superux"$UNAME_RELEASE" exit ;; SX-7:SUPER-UX:*:*) - echo sx7-nec-superux${UNAME_RELEASE} + echo sx7-nec-superux"$UNAME_RELEASE" exit ;; SX-8:SUPER-UX:*:*) - echo sx8-nec-superux${UNAME_RELEASE} + echo sx8-nec-superux"$UNAME_RELEASE" exit ;; SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux${UNAME_RELEASE} + echo sx8r-nec-superux"$UNAME_RELEASE" + exit ;; + SX-ACE:SUPER-UX:*:*) + echo sxace-nec-superux"$UNAME_RELEASE" exit ;; Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} + echo powerpc-apple-rhapsody"$UNAME_RELEASE" exit ;; *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE" exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - eval $set_cc_for_build + eval "$set_cc_for_build" if test "$UNAME_PROCESSOR" = unknown ; then UNAME_PROCESSOR=powerpc fi - if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if test "`echo "$UNAME_RELEASE" | sed -e 's/\..*//'`" -le 10 ; then + if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null then case $UNAME_PROCESSOR in i386) UNAME_PROCESSOR=x86_64 ;; powerpc) UNAME_PROCESSOR=powerpc64 ;; esac fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + 
then + UNAME_PROCESSOR=powerpc + fi fi elif test "$UNAME_PROCESSOR" = i386 ; then # Avoid executing cc on OS X 10.9, as it ships with a stub @@ -1292,27 +1327,33 @@ # that Apple uses in portable devices. UNAME_PROCESSOR=x86_64 fi - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE" exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then + if test "$UNAME_PROCESSOR" = x86; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + echo "$UNAME_PROCESSOR"-"$UNAME_MACHINE"-nto-qnx"$UNAME_RELEASE" exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; - NEO-?:NONSTOP_KERNEL:*:*) - echo neo-tandem-nsk${UNAME_RELEASE} + NEO-*:NONSTOP_KERNEL:*:*) + echo neo-tandem-nsk"$UNAME_RELEASE" exit ;; NSE-*:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} + echo nse-tandem-nsk"$UNAME_RELEASE" + exit ;; + NSR-*:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk"$UNAME_RELEASE" exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} + NSV-*:NONSTOP_KERNEL:*:*) + echo nsv-tandem-nsk"$UNAME_RELEASE" + exit ;; + NSX-*:NONSTOP_KERNEL:*:*) + echo nsx-tandem-nsk"$UNAME_RELEASE" exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux @@ -1321,18 +1362,18 @@ echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + echo "$UNAME_MACHINE"-"$UNAME_SYSTEM"-"$UNAME_RELEASE" exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. 
- if test "$cputype" = "386"; then + if test "$cputype" = 386; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi - echo ${UNAME_MACHINE}-${VENDOR}-plan9 + echo "$UNAME_MACHINE"-${VENDOR}-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-${VENDOR}-tops10 @@ -1353,14 +1394,14 @@ echo pdp10-${VENDOR}-its exit ;; SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} + echo mips-sei-seiux"$UNAME_RELEASE" exit ;; *:DragonFly:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + echo "$UNAME_MACHINE"-${VENDOR}-dragonfly"`echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`" exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in + case "$UNAME_MACHINE" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; @@ -1369,34 +1410,48 @@ echo i386-pc-xenix exit ;; i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + echo "$UNAME_MACHINE"-pc-skyos"`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`" exit ;; i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos + echo "$UNAME_MACHINE"-pc-rdos exit ;; i*86:AROS:*:*) - echo ${UNAME_MACHINE}-pc-aros + echo "$UNAME_MACHINE"-pc-aros exit ;; x86_64:VMkernel:*:*) - echo ${UNAME_MACHINE}-${VENDOR}-esx + echo "$UNAME_MACHINE"-${VENDOR}-esx + exit ;; + amd64:Isilon\ OneFS:*:*) + echo x86_64-${VENDOR}-onefs exit ;; esac +echo "$0: unable to guess system type" >&2 + +case "$UNAME_MACHINE:$UNAME_SYSTEM" in + mips:Linux | mips64:Linux) + # If we got here on MIPS GNU/Linux, output extra information. + cat >&2 <&2 < in order to provide the needed -information to handle your system. +If $0 has already been updated, send the following data and any +information you think might be pertinent to config-patches@gnu.org to +provide the necessary information to handle your system. 
config.guess timestamp = $timestamp @@ -1415,16 +1470,16 @@ /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} +UNAME_MACHINE = "$UNAME_MACHINE" +UNAME_RELEASE = "$UNAME_RELEASE" +UNAME_SYSTEM = "$UNAME_SYSTEM" +UNAME_VERSION = "$UNAME_VERSION" EOF exit 1 # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" diff -Nru elpa-2016.05.001/config.h.in elpa-2019.11.001/config.h.in --- elpa-2016.05.001/config.h.in 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/config.h.in 2019-12-21 16:29:45.000000000 +0000 @@ -1,7 +1,53 @@ /* config.h.in. Generated from configure.ac by autoheader. */ -/* use assumed size arrays, even if not debuggable */ -#undef DESPERATELY_WANT_ASSUMED_SIZE +/* use blocking in trans_ev_band_to_full */ +#undef BAND_TO_FULL_BLOCKING + +/* build for K-Computer */ +#undef BUILD_KCOMPUTER + +/* build for SX-Aurora */ +#undef BUILD_SXAURORA + +/* "Current ELPA API version" */ +#undef CURRENT_API_VERSION + +/* "Current ELPA autotune version" */ +#undef CURRENT_AUTOTUNE_VERSION + +/* enable CUDA debugging */ +#undef DEBUG_CUDA + +/* Earliest supported ELPA API version */ +#undef EARLIEST_API_VERSION + +/* Earliest ELPA API version, which supports autotuning */ +#undef EARLIEST_AUTOTUNE_VERSION + +/* enable autotuning functionality */ +#undef ENABLE_AUTOTUNING + +/* enable C tests */ +#undef ENABLE_C_TESTS + +/* allow to link against the 64bit integer versions of math libraries */ +#undef HAVE_64BIT_INTEGER_MATH_SUPPORT + +/* allow to link against the 64bit integer versions of the MPI library */ +#undef HAVE_64BIT_INTEGER_MPI_SUPPORT + +/* Define to 1 to support Advanced Bit Manipulation */ +#undef 
HAVE_ABM + +/* Define to 1 to support Multi-Precision Add-Carry Instruction Extensions */ +#undef HAVE_ADX + +/* Define to 1 to support Advanced Encryption Standard New Instruction Set + (AES-NI) */ +#undef HAVE_AES + +/* Support Altivec instructions */ +#undef HAVE_ALTIVEC /* AVX is supported on this CPU */ #undef HAVE_AVX @@ -9,15 +55,66 @@ /* AVX2 is supported on this CPU */ #undef HAVE_AVX2 -/* Enable more timings */ +/* AVX512 is supported on this CPU */ +#undef HAVE_AVX512 + +/* Define to 1 to support AVX-512 Byte and Word Instructions */ +#undef HAVE_AVX512_BW + +/* Define to 1 to support AVX-512 Conflict Detection Instructions */ +#undef HAVE_AVX512_CD + +/* Define to 1 to support AVX-512 Doubleword and Quadword Instructions */ +#undef HAVE_AVX512_DQ + +/* Define to 1 to support AVX-512 Exponential & Reciprocal Instructions */ +#undef HAVE_AVX512_ER + +/* Define to 1 to support AVX-512 Foundation Extensions */ +#undef HAVE_AVX512_F + +/* Define to 1 to support AVX-512 Integer Fused Multiply Add Instructions */ +#undef HAVE_AVX512_IFMA + +/* Define to 1 to support AVX-512 Conflict Prefetch Instructions */ +#undef HAVE_AVX512_PF + +/* Define to 1 to support AVX-512 Vector Byte Manipulation Instructions */ +#undef HAVE_AVX512_VBMI + +/* Define to 1 to support AVX-512 Vector Length Extensions */ +#undef HAVE_AVX512_VL + +/* AVX512 for Xeon is supported on this CPU */ +#undef HAVE_AVX512_XEON + +/* AVX512 for Xeon-PHI is supported on this CPU */ +#undef HAVE_AVX512_XEON_PHI + +/* Define to 1 to support Bit Manipulation Instruction Set 1 */ +#undef HAVE_BMI1 + +/* Define to 1 to support Bit Manipulation Instruction Set 2 */ +#undef HAVE_BMI2 + +/* Enable more timing */ #undef HAVE_DETAILED_TIMINGS /* Define to 1 if you have the header file. 
*/ #undef HAVE_DLFCN_H -/* Fortran can querry environment variables */ +/* Fortran can query environment variables */ #undef HAVE_ENVIRONMENT_CHECKING +/* Define to 1 to support Fused Multiply-Add Extensions 3 */ +#undef HAVE_FMA3 + +/* Define to 1 to support Fused Multiply-Add Extensions 4 */ +#undef HAVE_FMA4 + +/* automatically support clusters with different Intel CPUs */ +#undef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H @@ -27,18 +124,66 @@ /* Use the PAPI library */ #undef HAVE_LIBPAPI +/* Use likwid */ +#undef HAVE_LIKWID + /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Define to 1 to support Multimedia Extensions */ +#undef HAVE_MMX + +/* can use the Fortran mpi module */ +#undef HAVE_MPI_MODULE + +/* Define to 1 to support Memory Protection Extensions */ +#undef HAVE_MPX + +/* NEON_ARCH64 intrinsics are supported on this CPU */ +#undef HAVE_NEON_ARCH64_SSE + +/* Define to 1 to support Prefetch Vector Data Into Caches WT1 */ +#undef HAVE_PREFETCHWT1 + +/* Define to 1 to support Digital Random Number Generator */ +#undef HAVE_RDRND + /* Redirect stdout and stderr of test programs per MPI tasks to a file */ #undef HAVE_REDIRECT -/* assembly SSE is supported on this CPU */ -#undef HAVE_SSE_ASSEMBLY +/* Define to 1 to support Secure Hash Algorithm Extension */ +#undef HAVE_SHA + +/* build for skewsyemmtric case */ +#undef HAVE_SKEWSYMMETRIC + +/* SPARC64 intrinsics are supported on this CPU */ +#undef HAVE_SPARC64_SSE + +/* Define to 1 to support Streaming SIMD Extensions */ +#undef HAVE_SSE + +/* Define to 1 to support Streaming SIMD Extensions */ +#undef HAVE_SSE2 + +/* Define to 1 to support Streaming SIMD Extensions 3 */ +#undef HAVE_SSE3 + +/* Define to 1 to support Streaming SIMD Extensions 4.1 */ +#undef HAVE_SSE4_1 + +/* Define to 1 to support Streaming SIMD Extensions 4.2 */ +#undef HAVE_SSE4_2 + +/* Define to 1 to support AMD Streaming SIMD Extensions 4a */ 
+#undef HAVE_SSE4a /* gcc intrinsics SSE is supported on this CPU */ #undef HAVE_SSE_INTRINSICS +/* Define to 1 to support Supplemental Streaming SIMD Extensions 3 */ +#undef HAVE_SSSE3 + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H @@ -60,9 +205,21 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H +/* Support VSX instructions */ +#undef HAVE_VSX + +/* Altivec VSX intrinsics are supported on this CPU */ +#undef HAVE_VSX_SSE + +/* Define to 1 to support eXtended Operations Extensions */ +#undef HAVE_XOP + /* Define to the sub-directory where libtool stores uninstalled libraries. */ #undef LT_OBJDIR +/* enable error argument in C-API to be optional */ +#undef OPTIONAL_C_ERROR_ARGUMENT + /* Name of package */ #undef PACKAGE @@ -84,95 +241,177 @@ /* Define to the version of this package. */ #undef PACKAGE_VERSION +/* Work around a PGI bug with variable-length string results */ +#undef PGI_VARIABLE_STRING_BUG + +/* The size of `long int', as computed by sizeof. */ +#undef SIZEOF_LONG_INT + /* Define to 1 if you have the ANSI C header files. 
*/ #undef STDC_HEADERS +/* compile build config into the library object */ +#undef STORE_BUILD_CONFIG + +/* for performance reasons use assumed size Fortran arrays, even if not + debuggable */ +#undef USE_ASSUMED_SIZE + +/* use some Fortran 2008 features */ +#undef USE_FORTRAN2008 + /* Version number of package */ #undef VERSION -/* can use complex_avx2_block1 kernel */ +/* build also single-precision for complex calculation */ +#undef WANT_SINGLE_PRECISION_COMPLEX + +/* build also single-precision for real calculation */ +#undef WANT_SINGLE_PRECISION_REAL + +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_AVX2_BLOCK1_KERNEL -/* can use complex_avx2_block2 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_AVX2_BLOCK2_KERNEL -/* can use complex_avx_block1 kernel */ +/* Build elpa_m4_kernel kernel */ +#undef WITH_COMPLEX_AVX512_BLOCK1_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_COMPLEX_AVX512_BLOCK2_KERNEL + +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_AVX_BLOCK1_KERNEL -/* can use complex_avx_block2 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_AVX_BLOCK2_KERNEL -/* can use complex generic kernel */ +/* Build elpa_m4_kernel kernel */ +#undef WITH_COMPLEX_BGP_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_COMPLEX_BGQ_KERNEL + +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_GENERIC_KERNEL -/* can use complex generic-simple kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL -/* can use complex SSE assembly kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL -/* can use complex_sse_block1 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_SSE_BLOCK1_KERNEL -/* can use complex_sse_block2 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_COMPLEX_SSE_BLOCK2_KERNEL -/* use MPI */ -#undef WITH_MPI +/* use only one specific complex kernel (set at compile time) */ +#undef WITH_FIXED_COMPLEX_KERNEL -/* do not 
use only one specific complex kernel (set at compile time) */ -#undef WITH_NO_SPECIFIC_COMPLEX_KERNEL +/* use only one specific real kernel (set at compile time) */ +#undef WITH_FIXED_REAL_KERNEL -/* do not use only one specific real kernel (set at compile time) */ -#undef WITH_NO_SPECIFIC_REAL_KERNEL +/* enable GPU support */ +#undef WITH_GPU_VERSION -/* use only one specific complex kernel (set at compile time) */ -#undef WITH_ONE_SPECIFIC_COMPLEX_KERNEL +/* use MPI */ +#undef WITH_MPI -/* use only one specific real kernel (set at compile time) */ -#undef WITH_ONE_SPECIFIC_REAL_KERNEL +/* enable NVTX support */ +#undef WITH_NVTX /* use OpenMP threading */ #undef WITH_OPENMP -/* can use real_avx2_block2 kernel */ +/* build and install python wrapper */ +#undef WITH_PYTHON + +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX2_BLOCK2_KERNEL -/* can use real_avx2_block4 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX2_BLOCK4_KERNEL -/* can use real_avx2_block6 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX2_BLOCK6_KERNEL -/* can use real_avx_block2 kernel */ +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_AVX512_BLOCK2_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_AVX512_BLOCK4_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_AVX512_BLOCK6_KERNEL + +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX_BLOCK2_KERNEL -/* can use real_avx_block4 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX_BLOCK4_KERNEL -/* can use real_avx_block6 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_AVX_BLOCK6_KERNEL -/* can use real BGP kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_BGP_KERNEL -/* can use real BGQ kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_BGQ_KERNEL -/* can use real generic kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_GENERIC_KERNEL -/* can use real generic-simple kernel */ +/* Build elpa_m4_kernel kernel */ 
+#undef WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL + +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_GENERIC_SIMPLE_KERNEL -/* can use real SSE assembly kernel */ +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_SPARC64_BLOCK2_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_SPARC64_BLOCK4_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_SPARC64_BLOCK6_KERNEL + +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_SSE_ASSEMBLY_KERNEL -/* can use real_sse_block2 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_SSE_BLOCK2_KERNEL -/* can use real_sse_block4 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_SSE_BLOCK4_KERNEL -/* can use real_sse_block6 kernel */ +/* Build elpa_m4_kernel kernel */ #undef WITH_REAL_SSE_BLOCK6_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_VSX_BLOCK2_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_VSX_BLOCK4_KERNEL + +/* Build elpa_m4_kernel kernel */ +#undef WITH_REAL_VSX_BLOCK6_KERNEL + +/* build SCALAPACK test cases */ +#undef WITH_SCALAPACK_TESTS diff -Nru elpa-2016.05.001/config.sub elpa-2019.11.001/config.sub --- elpa-2016.05.001/config.sub 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/config.sub 2019-12-21 16:29:46.000000000 +0000 @@ -1,8 +1,8 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright 1992-2014 Free Software Foundation, Inc. +# Copyright 1992-2018 Free Software Foundation, Inc. 
-timestamp='2014-12-03' +timestamp='2018-03-08' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -15,7 +15,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, see . +# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -33,7 +33,7 @@ # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases @@ -53,12 +53,11 @@ me=`echo "$0" | sed -e 's,.*/,,'` usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS - $0 [OPTION] ALIAS +Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS Canonicalize a configuration name. -Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -68,7 +67,7 @@ version="\ GNU config.sub ($timestamp) -Copyright 1992-2014 Free Software Foundation, Inc. +Copyright 1992-2018 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -95,7 +94,7 @@ *local*) # First pass through any local machine types. - echo $1 + echo "$1" exit ;; * ) @@ -113,24 +112,24 @@ # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. 
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ - knetbsd*-gnu* | netbsd*-gnu* | \ - kopensolaris*-gnu* | \ + knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ + kopensolaris*-gnu* | cloudabi*-eabi* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown + basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) - basic_machine=`echo $1 | sed 's/-[^-]*$//'` - if [ $basic_machine != $1 ] - then os=`echo $1 | sed 's/.*-/-/'` + basic_machine=`echo "$1" | sed 's/-[^-]*$//'` + if [ "$basic_machine" != "$1" ] + then os=`echo "$1" | sed 's/.*-/-/'` else os=; fi ;; esac @@ -179,44 +178,44 @@ ;; -sco6) os=-sco5v6 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. 
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -udk*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 @@ -228,10 +227,7 @@ os=-lynxos ;; -ptx*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` - ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` + basic_machine=`echo "$1" | sed -e 's/86-.*/86-sequent/'` ;; -psos*) os=-psos @@ -255,15 +251,16 @@ | arc | arceb \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ + | ba \ | be32 | be64 \ | bfin \ | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ - | epiphany \ - | fido | fr30 | frv \ + | e2k | epiphany \ + | fido | fr30 | frv | ft32 \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ - | i370 | i860 | i960 | ia64 \ + | i370 | i860 | i960 | ia16 | ia64 \ | ip2k | iq2000 \ | k1om \ | le32 | le64 \ @@ -299,13 +296,14 @@ | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 | or1k | or1knd | or32 \ - | pdp10 | pdp11 | pj | pjl \ + | pdp10 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ + | pru \ | pyramid \ | riscv32 | riscv64 \ | rl78 | rx \ | score \ - | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | 
sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ @@ -314,7 +312,7 @@ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | visium \ - | we32k \ + | wasm32 \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown @@ -335,7 +333,7 @@ basic_machine=$basic_machine-unknown os=-none ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65) ;; ms1) basic_machine=mt-unknown @@ -364,7 +362,7 @@ ;; # Object if more than one company name word. *-*-*) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. @@ -376,17 +374,18 @@ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ + | ba-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ - | elxsi-* \ + | e2k-* | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ - | i*86-* | i860-* | i960-* | ia64-* \ + | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ | ip2k-* | iq2000-* \ | k1om-* \ | le32-* | le64-* \ @@ -427,13 +426,15 @@ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ + | pru-* \ | pyramid-* \ + | riscv32-* | riscv64-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* 
| sx?-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ @@ -442,6 +443,7 @@ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | visium-* \ + | wasm32-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ @@ -455,7 +457,7 @@ # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 386bsd) - basic_machine=i386-unknown + basic_machine=i386-pc os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) @@ -489,7 +491,7 @@ basic_machine=x86_64-pc ;; amd64-*) - basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=x86_64-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl @@ -518,6 +520,9 @@ basic_machine=i386-pc os=-aros ;; + asmjs) + basic_machine=asmjs-unknown + ;; aux) basic_machine=m68k-apple os=-aux @@ -531,7 +536,7 @@ os=-linux ;; blackfin-*) - basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) @@ -539,13 +544,13 @@ os=-cnk ;; c54x-*) - basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c55x-*) - basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic55x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c6x-*) - basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=tic6x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray @@ -634,10 +639,18 @@ basic_machine=rs6000-bull os=-bosx ;; - dpx2* | dpx2*-bull) + dpx2*) basic_machine=m68k-bull os=-sysv3 ;; + e500v[12]) + basic_machine=powerpc-unknown + os=$os"spe" + ;; + e500v[12]-*) + basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` + os=$os"spe" + ;; ebmon29k) 
basic_machine=a29k-amd os=-ebmon @@ -727,9 +740,6 @@ hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; - hppa-next) - os=-nextstep3 - ;; hppaosf) basic_machine=hppa1.1-hp os=-osf @@ -742,26 +752,26 @@ basic_machine=i370-ibm ;; i*86v32) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; - i386-vsta | vsta) + vsta) basic_machine=i386-unknown os=-vsta ;; @@ -780,19 +790,16 @@ os=-sysv ;; leon-*|leon[3-9]-*) - basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` + basic_machine=sparc-`echo "$basic_machine" | sed 's/-.*//'` ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) - basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=m68k-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; - m88k-omron*) - basic_machine=m88k-omron - ;; magnum | m3230) basic_machine=mips-mips os=-sysv @@ -824,10 +831,10 @@ os=-mint ;; mips3*-*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'` ;; mips3*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k @@ -846,7 +853,7 @@ os=-msdos ;; ms1-*) - basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + basic_machine=`echo "$basic_machine" | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i686-pc @@ -888,7 +895,7 @@ basic_machine=v70-nec os=-sysv ;; - next | m*-next ) + next | m*-next) 
basic_machine=m68k-next case $os in -nextstep* ) @@ -933,6 +940,12 @@ nsr-tandem) basic_machine=nsr-tandem ;; + nsv-tandem) + basic_machine=nsv-tandem + ;; + nsx-tandem) + basic_machine=nsx-tandem + ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf @@ -965,7 +978,7 @@ os=-linux ;; parisc-*) - basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=hppa-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; pbd) @@ -981,7 +994,7 @@ basic_machine=i386-pc ;; pc98-*) - basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i386-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc @@ -996,16 +1009,16 @@ basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i586-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pentium4-*) - basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=i786-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould @@ -1015,23 +1028,23 @@ ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) - basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; - ppcle | powerpclittle | ppc-le | powerpc-little) + ppcle | powerpclittle) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpcle-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; - 
ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ppc64-*) basic_machine=powerpc64-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; - ppc64le | powerpc64little | ppc64-le | powerpc64-little) + ppc64le | powerpc64little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=powerpc64le-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm @@ -1058,12 +1071,18 @@ rtpc | rtpc-*) basic_machine=romp-ibm ;; - s390 | s390-*) + s390) basic_machine=s390-ibm ;; - s390x | s390x-*) + s390-*) + basic_machine=s390-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + s390x) basic_machine=s390x-ibm ;; + s390x-*) + basic_machine=s390x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; sa29200) basic_machine=a29k-amd os=-udi @@ -1085,17 +1104,10 @@ sequent) basic_machine=i386-sequent ;; - sh) - basic_machine=sh-hitachi - os=-hms - ;; sh5el) basic_machine=sh5le-unknown ;; - sh64) - basic_machine=sh64-unknown - ;; - sparclite-wrs | simso-wrs) + simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; @@ -1114,7 +1126,7 @@ os=-sysv4 ;; strongarm-* | thumb-*) - basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` + basic_machine=arm-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun @@ -1236,6 +1248,9 @@ basic_machine=hppa1.1-winbond os=-proelf ;; + x64) + basic_machine=x86_64-pc + ;; xbox) basic_machine=i686-pc os=-mingw32 @@ -1244,20 +1259,12 @@ basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) - basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` + basic_machine=`echo "$basic_machine" | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; - z8k-*-coff) - basic_machine=z8k-unknown - os=-sim - ;; - z80-*-coff) - basic_machine=z80-unknown - os=-sim - ;; none) basic_machine=none-none os=-none @@ -1286,10 +1293,6 @@ vax) basic_machine=vax-dec ;; - pdp10) - # there are many clones, so DEC is 
not a safe bet - basic_machine=pdp10-unknown - ;; pdp11) basic_machine=pdp11-dec ;; @@ -1299,9 +1302,6 @@ sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; - sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) - basic_machine=sparc-sun - ;; cydra) basic_machine=cydra-cydrome ;; @@ -1321,7 +1321,7 @@ # Make sure to match an already-canonicalized machine name. ;; *) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 exit 1 ;; esac @@ -1329,10 +1329,10 @@ # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) - basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + basic_machine=`echo "$basic_machine" | sed 's/digital.*/dec/'` ;; *-commodore*) - basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + basic_machine=`echo "$basic_machine" | sed 's/commodore.*/cbm/'` ;; *) ;; @@ -1343,8 +1343,8 @@ if [ x"$os" != x"" ] then case $os in - # First match some system type aliases - # that might get confused with valid system types. + # First match some system type aliases that might get confused + # with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux @@ -1355,45 +1355,48 @@ -solaris) os=-solaris2 ;; - -svr4*) - os=-sysv4 - ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; - # First accept the basic system types. + # es1800 is here to avoid being matched by es* (a different OS) + -es1800*) + os=-ose + ;; + # Now accept the basic system types. # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. + # Each alternative MUST end in a * to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* | -aros* \ + | -aos* | -aros* | -cloudabi* | -sortix* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -bitrig* | -openbsd* | -solidbsd* \ + | -hiux* | -knetbsd* | -mirbsd* | -netbsd* \ + | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ - | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* | -cegcc* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* | -hcos* \ + | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ + | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -morphos* | -superux* | -rtmk* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*) + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ + 
| -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox* | -bme* \ + | -midnightbsd*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) @@ -1410,12 +1413,12 @@ -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; - -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + -sim | -xray | -os68k* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) - os=`echo $os | sed -e 's|mac|macos|'` + os=`echo "$os" | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc @@ -1424,10 +1427,10 @@ os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) - os=`echo $os | sed -e 's|sunos5|solaris2|'` + os=`echo "$os" | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) - os=`echo $os | sed -e 's|sunos6|solaris3|'` + os=`echo "$os" | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition @@ -1438,12 +1441,6 @@ -wince*) os=-wince ;; - -osfrose*) - os=-osfrose - ;; - -osf*) - os=-osf - ;; -utek*) os=-bsd ;; @@ -1468,7 +1465,7 @@ -nova*) os=-rtmk-nova ;; - -ns2 ) + -ns2) os=-nextstep2 ;; -nsk*) @@ -1490,7 +1487,7 @@ -oss*) os=-sysv3 ;; - -svr4) + -svr4*) os=-sysv4 ;; -svr3) @@ -1505,32 +1502,38 @@ -ose*) os=-ose ;; - -es1800*) - os=-ose - ;; - -xenix) - os=-xenix - ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; - -aros*) - os=-aros - ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; + -pikeos*) + # Until real need of OS specific support for + # particular features comes up, bare metal + # configurations are quite functional. + case $basic_machine in + arm*) + os=-eabi + ;; + *) + os=-elf + ;; + esac + ;; -nacl*) ;; + -ios) + ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. 
os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + echo Invalid configuration \`"$1"\': system \`"$os"\' not recognized 1>&2 exit 1 ;; esac @@ -1620,12 +1623,12 @@ sparc-* | *-sun) os=-sunos4.1.1 ;; + pru-*) + os=-elf + ;; *-be) os=-beos ;; - *-haiku) - os=-haiku - ;; *-ibm) os=-aix ;; @@ -1665,7 +1668,7 @@ m88k-omron*) os=-luna ;; - *-next ) + *-next) os=-nextstep ;; *-sequent) @@ -1680,9 +1683,6 @@ i370-*) os=-mvs ;; - *-next) - os=-nextstep3 - ;; *-gould) os=-sysv ;; @@ -1792,15 +1792,15 @@ vendor=stratus ;; esac - basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + basic_machine=`echo "$basic_machine" | sed "s/unknown/$vendor/"` ;; esac -echo $basic_machine$os +echo "$basic_machine$os" exit # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" diff -Nru elpa-2016.05.001/configure elpa-2019.11.001/configure --- elpa-2016.05.001/configure 2016-05-20 07:04:36.000000000 +0000 +++ elpa-2019.11.001/configure 2019-12-21 16:29:45.000000000 +0000 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for elpa 2016.05.001. +# Generated by GNU Autoconf 2.69 for elpa 2019.11.001. # # Report bugs to . 
# @@ -198,6 +198,7 @@ as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1 test -n \"\${ZSH_VERSION+set}\${BASH_VERSION+set}\" || ( ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' @@ -205,8 +206,7 @@ ECHO=\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO PATH=/empty FPATH=/empty; export PATH FPATH test \"X\`printf %s \$ECHO\`\" = \"X\$ECHO\" \\ - || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1 -test \$(( 1 + 1 )) = 2 || exit 1" + || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1" if (eval "$as_required") 2>/dev/null; then : as_have_required=yes else @@ -590,12 +590,12 @@ # Identity of this package. PACKAGE_NAME='elpa' PACKAGE_TARNAME='elpa' -PACKAGE_VERSION='2016.05.001' -PACKAGE_STRING='elpa 2016.05.001' +PACKAGE_VERSION='2019.11.001' +PACKAGE_STRING='elpa 2019.11.001' PACKAGE_BUGREPORT='elpa-library@mpcdf.mpg.de' PACKAGE_URL='' -ac_unique_file="src/elpa1.F90" +ac_unique_file="src/elpa.F90" # Factoring default headers for most tests. 
ac_includes_default="\ #include @@ -636,13 +636,50 @@ am__EXEEXT_TRUE LTLIBOBJS LIBOBJS +pytest_found +WITH_PYTHON_TESTS_FALSE +WITH_PYTHON_TESTS_TRUE +NUMPY_INCLUDE +cython_found +PYTHON_CONFIG +PYTHON_INCLUDE +pkgpyexecdir +pyexecdir +pkgpythondir +pythondir +PYTHON_PLATFORM +PYTHON_EXEC_PREFIX +PYTHON_PREFIX +PYTHON_VERSION +PYTHON +WITH_PYTHON_FALSE +WITH_PYTHON_TRUE +ACTUAL_FC PKG_CONFIG_FILE SUFFIX +STORE_BUILD_CONFIG_FALSE +STORE_BUILD_CONFIG_TRUE +xxd_CHECK DOXYGEN_OUTPUT_DIR OPENMP_LDFLAGS -with_amd_bulldozer_kernel WITH_BLACS WITH_MKL +HAVE_SKEWSYMMETRIC_FALSE +HAVE_SKEWSYMMETRIC_TRUE +WANT_SINGLE_PRECISION_COMPLEX_FALSE +WANT_SINGLE_PRECISION_COMPLEX_TRUE +WANT_SINGLE_PRECISION_REAL_FALSE +WANT_SINGLE_PRECISION_REAL_TRUE +BUILD_KCOMPUTER_FALSE +BUILD_KCOMPUTER_TRUE +ENABLE_C_TESTS_FALSE +ENABLE_C_TESTS_TRUE +ENABLE_AUTOTUNING_FALSE +ENABLE_AUTOTUNING_TRUE +USE_FORTRAN2008_FALSE +USE_FORTRAN2008_TRUE +WITH_USE_ASSUMED_SIZE_FALSE +WITH_USE_ASSUMED_SIZE_TRUE DOXYGEN_PAPER_SIZE DX_COND_latex_FALSE DX_COND_latex_TRUE @@ -689,7 +726,6 @@ DX_DOCDIR DX_CONFIG DX_PROJECT -CPP LT_SYS_LIBRARY_PATH OTOOL64 OTOOL @@ -706,83 +742,155 @@ DUMPBIN LD FGREP -EGREP -GREP SED LIBTOOL +ELPA_2STAGE_REAL_GPU_COMPILED +ELPA_2STAGE_COMPLEX_GPU_COMPILED +WITH_GPU_VERSION_FALSE +WITH_GPU_VERSION_TRUE +ELPA_2STAGE_COMPLEX_BGQ_COMPILED +WITH_COMPLEX_BGQ_KERNEL_FALSE +WITH_COMPLEX_BGQ_KERNEL_TRUE +ELPA_2STAGE_REAL_BGQ_COMPILED WITH_REAL_BGQ_KERNEL_FALSE WITH_REAL_BGQ_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_BGP_COMPILED +WITH_COMPLEX_BGP_KERNEL_FALSE +WITH_COMPLEX_BGP_KERNEL_TRUE +ELPA_2STAGE_REAL_BGP_COMPILED WITH_REAL_BGP_KERNEL_FALSE WITH_REAL_BGP_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED +WITH_COMPLEX_AVX512_BLOCK2_KERNEL_FALSE +WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED +WITH_COMPLEX_AVX512_BLOCK1_KERNEL_FALSE +WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED +WITH_REAL_AVX512_BLOCK6_KERNEL_FALSE 
+WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED +WITH_REAL_AVX512_BLOCK4_KERNEL_FALSE +WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED +WITH_REAL_AVX512_BLOCK2_KERNEL_FALSE +WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE -WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE -WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE -WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE -WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE -WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE -WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE -WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE -WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED WITH_REAL_AVX2_BLOCK6_KERNEL_FALSE WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED WITH_REAL_AVX2_BLOCK4_KERNEL_FALSE WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED WITH_REAL_AVX2_BLOCK2_KERNEL_FALSE WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED +WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE +WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED +WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE +WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED WITH_REAL_AVX_BLOCK6_KERNEL_FALSE WITH_REAL_AVX_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED WITH_REAL_AVX_BLOCK4_KERNEL_FALSE WITH_REAL_AVX_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED WITH_REAL_AVX_BLOCK2_KERNEL_FALSE WITH_REAL_AVX_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED +WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE +WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE +ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED +WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE +WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED +WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE 
+WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED +WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE +WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE +ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED WITH_REAL_SSE_BLOCK6_KERNEL_FALSE WITH_REAL_SSE_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED WITH_REAL_SSE_BLOCK4_KERNEL_FALSE WITH_REAL_SSE_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED WITH_REAL_SSE_BLOCK2_KERNEL_FALSE WITH_REAL_SSE_BLOCK2_KERNEL_TRUE -WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE -WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE -WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE -WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE +ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED +WITH_REAL_VSX_BLOCK6_KERNEL_FALSE +WITH_REAL_VSX_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED +WITH_REAL_VSX_BLOCK4_KERNEL_FALSE +WITH_REAL_VSX_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED +WITH_REAL_VSX_BLOCK2_KERNEL_FALSE +WITH_REAL_VSX_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED +WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_FALSE +WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED +WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_FALSE +WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED +WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_FALSE +WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED +WITH_REAL_SPARC64_BLOCK6_KERNEL_FALSE +WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED +WITH_REAL_SPARC64_BLOCK4_KERNEL_FALSE +WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED +WITH_REAL_SPARC64_BLOCK2_KERNEL_FALSE +WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE -WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE -WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE +ELPA_2STAGE_COMPLEX_GENERIC_COMPILED WITH_COMPLEX_GENERIC_KERNEL_FALSE WITH_COMPLEX_GENERIC_KERNEL_TRUE 
+ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED +WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_FALSE +WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE +ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED +WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_FALSE +WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE +ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED +WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE +WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE +ELPA_2STAGE_REAL_GENERIC_COMPILED WITH_REAL_GENERIC_KERNEL_FALSE WITH_REAL_GENERIC_KERNEL_TRUE +nvcc_found +NVCCFLAGS +NVCC +ELPA_2STAGE_COMPLEX_DEFAULT +ELPA_2STAGE_REAL_DEFAULT +BAND_TO_FULL_BLOCKING_FALSE +BAND_TO_FULL_BLOCKING_TRUE SCALAPACK_FCFLAGS SCALAPACK_LDFLAGS -FCLIBS -host_os -host_vendor -host_cpu -host -build_os -build_vendor -build_cpu -build -FC_MODOUT -ac_empty -FC_MODINC -HAVE_AVX2_FALSE -HAVE_AVX2_TRUE -HAVE_AVX_FALSE -HAVE_AVX_TRUE -HAVE_SSE_INTRINSICS_FALSE -HAVE_SSE_INTRINSICS_TRUE -HAVE_SSE_ASSEMBLY_FALSE -HAVE_SSE_ASSEMBLY_TRUE HAVE_DETAILED_TIMINGS_FALSE HAVE_DETAILED_TIMINGS_TRUE HAVE_REDIRECT_FALSE HAVE_REDIRECT_TRUE +MPI_BINARY OPENMP_FCFLAGS +FCLIBS +FC_MODOUT +ac_empty +FC_MODINC +FCFLAGS_F90 FCFLAGS ac_ct_FC FC @@ -793,6 +901,25 @@ CCAS ac_ct_AR AR +HAVE_64BIT_INTEGER_MPI_SUPPORT_FALSE +HAVE_64BIT_INTEGER_MPI_SUPPORT_TRUE +HAVE_64BIT_INTEGER_MATH_SUPPORT_FALSE +HAVE_64BIT_INTEGER_MATH_SUPPORT_TRUE +EGREP +GREP +CPP +HAVE_HETEROGENOUS_CLUSTER_SUPPORT_FALSE +HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE +CPUEXT_FLAGS +SIMD_FLAGS +host_os +host_vendor +host_cpu +host +build_os +build_vendor +build_cpu +build OPENMP_CFLAGS am__fastdepCC_FALSE am__fastdepCC_TRUE @@ -801,7 +928,6 @@ AMDEPBACKSLASH AMDEP_FALSE AMDEP_TRUE -am__quote am__include DEPDIR OBJEXT @@ -811,16 +937,19 @@ CFLAGS ac_ct_CC CC +WITH_SCALAPACK_TESTS_FALSE +WITH_SCALAPACK_TESTS_TRUE WITH_MPI_FALSE WITH_MPI_TRUE WITH_OPENMP_FALSE WITH_OPENMP_TRUE FORTRAN_MODULE_DEPS +OPTIONAL_C_ERROR_ARGUMENT_FALSE +OPTIONAL_C_ERROR_ARGUMENT_TRUE ifGNUmake +CURRENT_AUTOTUNE_VERSION +CURRENT_API_VERSION 
ELPA_SO_VERSION -MAINT -MAINTAINER_MODE_FALSE -MAINTAINER_MODE_TRUE AM_BACKSLASH AM_DEFAULT_VERBOSITY AM_DEFAULT_V @@ -885,39 +1014,48 @@ PACKAGE_TARNAME PACKAGE_NAME PATH_SEPARATOR -SHELL' +SHELL +am__quote' ac_subst_files='' ac_user_opts=' enable_option_checking enable_silent_rules -enable_maintainer_mode +enable_optional_argument_in_C_API enable_openmp with_mpi +enable_scalapack_tests enable_dependency_tracking -with_ftimings -with_redirect -enable_papi -with_avx_optimization -with_real_generic_kernel_only -with_real_generic_simple_kernel_only -with_real_sse_assembly_kernel_only -with_real_bgp_kernel_only -with_real_bgq_kernel_only -with_real_sse_block2_kernel_only -with_real_sse_block4_kernel_only -with_real_sse_block6_kernel_only -with_real_avx_block2_kernel_only -with_real_avx_block4_kernel_only -with_real_avx_block6_kernel_only -with_complex_generic_kernel_only -with_complex_generic_simple_kernel_only -with_complex_sse_assembly_kernel_only -with_complex_bgp_kernel_only -with_complex_bgq_kernel_only -with_complex_sse_block1_kernel_only -with_complex_sse_block2_kernel_only -with_complex_avx_block1_kernel_only -with_complex_avx_block2_kernel_only +enable_heterogenous_cluster_support +enable_64bit_integer_math_support +enable_64bit_integer_mpi_support +enable_redirect +enable_single_precision +enable_timings +with_papi +with_likwid +enable_band_to_full_blocking +with_cuda_path +with_cuda_sdk_path +with_GPU_compute_capability +enable_mpi_module +enable_generic +enable_sparc64 +enable_neon_arch64 +enable_vsx +enable_sse +enable_sse_assembly +enable_avx +enable_avx2 +enable_avx512 +enable_bgp +enable_bgq +with_fixed_real_kernel +with_fixed_complex_kernel +with_default_real_kernel +with_default_complex_kernel +enable_gpu +enable_gpu_memory_debug +enable_nvtx enable_shared enable_static with_pic @@ -936,6 +1074,16 @@ enable_doxygen_html enable_doxygen_ps enable_doxygen_pdf +enable_assumed_size +enable_Fortran2008_features +enable_autotuning +enable_c_tests 
+enable_K_computer +enable_SX_Aurora +enable_skew_symmetric_support +enable_store_build_config +enable_python +enable_python_tests ' ac_precious_vars='build_alias host_alias @@ -945,6 +1093,7 @@ LDFLAGS LIBS CPPFLAGS +CPP CCAS CCASFLAGS FC @@ -952,8 +1101,11 @@ SCALAPACK_LDFLAGS SCALAPACK_FCFLAGS LT_SYS_LIBRARY_PATH -CPP -DOXYGEN_PAPER_SIZE' +DOXYGEN_PAPER_SIZE +PYTHON +PYTHON_INCLUDE +PYTHON_CONFIG +NUMPY_INCLUDE' # Initialize some variables set by options. @@ -1494,7 +1646,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures elpa 2016.05.001 to adapt to many kinds of systems. +\`configure' configures elpa 2019.11.001 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1564,7 +1716,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of elpa 2016.05.001:";; + short | recursive ) echo "Configuration of elpa 2019.11.001:";; esac cat <<\_ACEOF @@ -1574,16 +1726,57 @@ --enable-FEATURE[=ARG] include FEATURE [ARG=yes] --enable-silent-rules less verbose build output (undo: "make V=1") --disable-silent-rules verbose build output (undo: "make V=0") - --enable-maintainer-mode - enable make rules and dependencies not useful (and - sometimes confusing) to the casual installer + --enable-optional-argument-in-C-API + do not build C API with error argument as optional, + default no --enable-openmp use OpenMP threading, default no. + --enable-scalapack-tests + build SCALAPACK test cases for performance + comparison, needs MPI, default no. 
--enable-dependency-tracking do not reject slow dependency extractors --disable-dependency-tracking speeds up one-time build - --disable-papi Do not use PAPI to also measure flop count, - autodetected by default + --heterogenous-cluster-support + allow ELPA to automatically switch to a kernel + supported on all CPUs of a cluster (only works for + Intel CPUs at the moment), default no. Activate only + if necessary has a performance penalty! This feature + is exerpimental! + --64bit-integer-math-support + allows to link against the 64bit integer versions of + the math libraries BLAS, LAPACK, and SCALAPACK + --64bit-integer-mpi-support + allows to link against the 64bit integer versions of + the MPI library + --enable-redirect for test programs, allow redirection of + stdout/stderr per MPI taks in a file (useful for + timing), default no. + --enable-single-precision + build with single precision + --disable-timings more detailed timing, default yes + --disable-band-to-full-blocking + build ELPA2 with blocking in band_to_full (default: + enabled) + --disable-mpi-module do not use the Fortran MPI module, get interfaces by + 'include "mpif.h' + --disable-generic do not build GENERIC kernels, default: enabled + --enable-sparc64 build SPARC64 kernels, default: disabled + --enable-neon-arch64 build NEON_ARCH64 kernels, default: disabled + --enable-vsx build VSX kernels, default: disabled + --disable-sse do not build SSE kernels, default: enabled + --disable-sse-assembly do not build SSE_ASSEMBLY kernels, default: enabled + --disable-avx do not build AVX kernels, default: enabled + --disable-avx2 do not build AVX2 kernels, default: enabled + --disable-avx512 do not build AVX512 kernels, default: enabled + --enable-bgp build BGP kernels, default: disabled + --enable-bgq build BGQ kernels, default: disabled + --enable-gpu do use GPU version + --enable-gpu-memory-debug + Output information on GPU memory to be processed by + utils/memory/check_memory.py + --enable-nvtx build and 
install nvtx wrapper for profiling th GPU + version, default no. --enable-shared[=PKGS] build shared libraries [default=yes] --enable-static[=PKGS] build static libraries [default=yes] --enable-fast-install[=PKGS] @@ -1600,60 +1793,71 @@ --disable-doxygen-html don't generate doxygen plain HTML documentation --enable-doxygen-ps generate doxygen PostScript documentation --enable-doxygen-pdf generate doxygen PDF documentation + --disable-assumed-size do NOT use assumed-size Fortran arrays + --enable-Fortran2008-features + enables some Fortran 2008 features, default yes. + --enable-autotuning enables autotuning functionality, default yes. + --enable-c-tests enables the C tests for elpa, default yes. + --enable-K-computer enable builds on K-Computer, default no. + --enable-SX-Aurora enable builds on SX-Aurora, default no. + --enable-skew-symmetric-support + enable support for real valued skew-symmetric + matrices + --enable-store-build-config + compile build config into the library object, + default no + --enable-python build and install python wrapper, default no. + --enable-python-tests enable python tests, default no. Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-mpi=[yes|no] compile with MPI. Default: yes - --with-ftimings detailed timings, default no. - --with-redirect for test programs, allow redirection of - stdout/stderr per MPI taks in a file (useful for - ftimings), default no. - --with-avx-optimization use AVX optimization, default no. 
- --with-real-generic-kernel-only - only compile generic-kernel for real case - --with-real-generic-simple-kernel-only - only compile generic-simple-kernel for real case - --with-real-sse-assembly-kernel-only - only compile sse-assembly-kernel for real case - --with-real-bgp-kernel-only - only compile bgp-kernel for real case - --with-real-bgq-kernel-only - only compile bgq-kernel for real case - --with-real-sse-block2-kernel-only - only compile real-sse-block2-kernel for real case - --with-real-sse-block4-kernel-only - only compile real-sse-block4-kernel for real case - --with-real-sse-block6-kernel-only - only compile real-sse-block6-kernel for real case - --with-real-avx-block2-kernel-only - only compile real-avx-block2-kernel for real case - --with-real-avx-block4-kernel-only - only compile real-avx-block4-kernel for real case - --with-real-avx-block6-kernel-only - only compile real-avx-block6-kernel for real case - --with-complex-generic-kernel-only - only compile generic-kernel for complex case - --with-complex-generic-simple-kernel-only - only compile generic-simple-kernel for complex case - --with-complex-sse-assembly-kernel-only - only compile sse-assembly-kernel for complex case - --with-complex-bgp-kernel-only - only compile bgp-kernel for complex case - --with-complex-bgq-kernel-only - only compile bgq-kernel for complex case - --with-complex-sse-block1-kernel-only - only compile complex-sse-block1-kernel for complex - case - --with-complex-sse-block2-kernel-only - only compile complex-sse-block2-kernel for complex - case - --with-complex-avx-block1-kernel-only - only compile complex-avx-block1-kernel for complex - case - --with-complex-avx-block2-kernel-only - only compile complex-avx-block2-kernel for complex - case + --with-papi Use PAPI to also measure flop count in the detailed + timing (--enable-timing), disabled by default + --with-likwid=[yes|no(default)|PATH] + Use Likwid to measure performance in some parts of + the library + 
--with-cuda-path=PATH prefix where CUDA is installed [default=auto] + --with-cuda-sdk-path=PATH + prefix where CUDA SDK is installed [default=auto] + --with-GPU-compute-capability=VALUE + use compute capability VALUE for GPU version, + default: "sm_35" + --with-fixed-real-kernel=KERNEL + compile with only a single specific real kernel. + Available kernels are: generic generic_simple + generic_simple_block4 generic_simple_block6 + sparc64_block2 sparc64_block4 sparc64_block6 + neon_arch64_block2 neon_arch64_block4 + neon_arch64_block6 vsx_block2 vsx_block4 vsx_block6 + sse_block2 sse_block4 sse_block6 sse_assembly + avx_block2 avx_block4 avx_block6 avx2_block2 + avx2_block4 avx2_block6 avx512_block2 avx512_block4 + avx512_block6 bgp bgq + --with-fixed-complex-kernel=KERNEL + compile with only a single specific complex kernel. + Available kernels are: generic generic_simple + sse_block1 sse_block2 sse_assembly avx_block1 + avx_block2 avx2_block1 avx2_block2 avx512_block1 + avx512_block2 bgp bgq + --with-default-real-kernel=KERNEL + set a specific real kernel as default kernel. + Available kernels are: generic generic_simple + generic_simple_block4 generic_simple_block6 + sparc64_block2 sparc64_block4 sparc64_block6 + neon_arch64_block2 neon_arch64_block4 + neon_arch64_block6 vsx_block2 vsx_block4 vsx_block6 + sse_block2 sse_block4 sse_block6 sse_assembly + avx_block2 avx_block4 avx_block6 avx2_block2 + avx2_block4 avx2_block6 avx512_block2 avx512_block4 + avx512_block6 bgp bgq + --with-default-complex-kernel=KERNEL + set a specific complex kernel as default kernel. + Available kernels are: generic generic_simple + sse_block1 sse_block2 sse_assembly avx_block1 + avx_block2 avx2_block1 avx2_block2 avx512_block1 + avx512_block2 bgp bgq --with-pic[=PKGS] try to use only PIC/non-PIC objects [default=use both] --with-aix-soname=aix|svr4|both @@ -1671,6 +1875,7 @@ LIBS libraries to pass to the linker, e.g. -l CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. 
-I if you have headers in a nonstandard directory + CPP C preprocessor CCAS assembler compiler command (defaults to CC) CCASFLAGS assembler compiler flags (defaults to CFLAGS) FC Fortran compiler command @@ -1682,9 +1887,15 @@ Scalapack LT_SYS_LIBRARY_PATH User-defined run-time library search path. - CPP C preprocessor DOXYGEN_PAPER_SIZE a4wide (default), a4, letter, legal or executive + PYTHON the Python interpreter + PYTHON_INCLUDE + Include flags for python, bypassing python-config + PYTHON_CONFIG + Path to python-config + NUMPY_INCLUDE + Include flags for numpy Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. @@ -1752,7 +1963,7 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -elpa configure 2016.05.001 +elpa configure 2019.11.001 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -1850,120 +2061,230 @@ } # ac_fn_c_try_link -# ac_fn_fc_try_compile LINENO -# --------------------------- -# Try to compile conftest.$ac_ext, and return whether this succeeded. -ac_fn_fc_try_compile () +# ac_fn_c_try_run LINENO +# ---------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes +# that executables *can* be run. +ac_fn_c_try_run () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext - if { { ac_try="$ac_compile" + if { { ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compile") 2>conftest.err + (eval "$ac_link") 2>&5 ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_fc_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_fc_try_compile - -# ac_fn_fc_try_link LINENO -# ------------------------ -# Try to link conftest.$ac_ext, and return whether this succeeded. -ac_fn_fc_try_link () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext conftest$ac_exeext - if { { ac_try="$ac_link" -case "(($ac_try" in + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>conftest.err + (eval "$ac_try") 2>&5 ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_fc_werror_flag" || - test ! -s conftest.err - } && test -s conftest$ac_exeext && { - test "$cross_compiling" = yes || - test -x conftest$ac_exeext - }; then : + test $ac_status = 0; }; }; then : ac_retval=0 else - $as_echo "$as_me: failed program was:" >&5 + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - ac_retval=1 + ac_retval=$ac_status fi - # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information - # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would - # interfere with the next link command; also delete a directory that is - # left behind by Apple's compiler. 
We do this before executing the actions. rm -rf conftest.dSYM conftest_ipa8_conftest.oo eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno as_fn_set_status $ac_retval -} # ac_fn_fc_try_link +} # ac_fn_c_try_run -# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES -# ------------------------------------------------------- -# Tests whether HEADER exists and can be compiled using the include files in -# INCLUDES, setting the cache variable VAR accordingly. -ac_fn_c_check_header_compile () +# ac_fn_c_compute_int LINENO EXPR VAR INCLUDES +# -------------------------------------------- +# Tries to find the compile-time value of EXPR in a program that includes +# INCLUDES, setting VAR accordingly. Returns whether the value could be +# computed +ac_fn_c_compute_int () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 + if test "$cross_compiling" = yes; then + # Depending upon the size, compute the lo and hi bounds. +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) >= 0)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=0 ac_mid=0 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) <= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid; break +else + as_fn_arith $ac_mid + 1 && ac_lo=$as_val + if test $ac_lo -le $ac_mid; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ $4 -#include <$2> +int +main () +{ +static int test_array [1 - 2 * !(($2) < 0)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} _ACEOF if ac_fn_c_try_compile "$LINENO"; then : - eval "$3=yes" + ac_hi=-1 ac_mid=-1 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) >= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=$ac_mid; break else - eval "$3=no" + as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val + if test $ac_mid -le $ac_hi; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + ac_lo= ac_hi= fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +# Binary search between lo and hi bounds. +while test "x$ac_lo" != "x$ac_hi"; do + as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) <= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid +else + as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +case $ac_lo in #(( +?*) eval "$3=\$ac_lo"; ac_retval=0 ;; +'') ac_retval=1 ;; +esac + else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +static long int longval () { return $2; } +static unsigned long int ulongval () { return $2; } +#include +#include +int +main () +{ + + FILE *f = fopen ("conftest.val", "w"); + if (! f) + return 1; + if (($2) < 0) + { + long int i = longval (); + if (i != ($2)) + return 1; + fprintf (f, "%ld", i); + } + else + { + unsigned long int i = ulongval (); + if (i != ($2)) + return 1; + fprintf (f, "%lu", i); + } + /* Do not output a trailing newline, as this causes \r\n confusion + on some platforms. */ + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + echo >>conftest.val; read $3 &5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_compile + +# ac_fn_fc_try_compile LINENO +# --------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_fc_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 + (eval "$ac_compile") 2>conftest.err ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' - { { case "(($ac_try" in + test $ac_status = 0; } && { + test -z "$ac_fc_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_fc_try_compile + +# ac_fn_fc_try_link LINENO +# ------------------------ +# Try to link conftest.$ac_ext, and return whether this succeeded. +ac_fn_fc_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 + (eval "$ac_link") 2>conftest.err ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then : + test $ac_status = 0; } && { + test -z "$ac_fc_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : ac_retval=0 else - $as_echo "$as_me: program exited with status $ac_status" >&5 - $as_echo "$as_me: failed program was:" >&5 + $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - ac_retval=$ac_status + ac_retval=1 fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. rm -rf conftest.dSYM conftest_ipa8_conftest.oo eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno as_fn_set_status $ac_retval -} # ac_fn_c_try_run +} # ac_fn_fc_try_link # ac_fn_c_check_func LINENO FUNC VAR # ---------------------------------- @@ -2114,7 +2508,7 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by elpa $as_me 2016.05.001, which was +It was created by elpa $as_me 2019.11.001, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2466,8 +2860,7 @@ - -am__api_version='1.15' +am__api_version='1.16' ac_aux_dir= for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do @@ -2982,7 +3375,7 @@ # Define the identity of the package. PACKAGE='elpa' - VERSION='2016.05.001' + VERSION='2019.11.001' cat >>confdefs.h <<_ACEOF @@ -3012,8 +3405,8 @@ # For better backward compatibility. To be removed once Automake 1.9.x # dies out for good. For more background, see: -# -# +# +# mkdir_p='$(MKDIR_P)' # We need awk for the "check" target (and possibly the TAP driver). The @@ -3064,7 +3457,7 @@ Aborting the configuration process, to ensure you take notice of the issue. You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . 
+that behaves properly: . If you want to complete the configuration process using your problematic 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM @@ -3076,36 +3469,6 @@ fi -# Without this, automake tries to be smart and rebuilt -# the autoconf generated files such as configure, aclocal.m4, etc., -# in case the timestamps of files such as configure.ac are newer -# -# This only makes trouble for end users with out-of-date autoconf versions -# that cannot produce these files - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5 -$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; } - # Check whether --enable-maintainer-mode was given. -if test "${enable_maintainer_mode+set}" = set; then : - enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval -else - USE_MAINTAINER_MODE=no -fi - - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5 -$as_echo "$USE_MAINTAINER_MODE" >&6; } - if test $USE_MAINTAINER_MODE = yes; then - MAINTAINER_MODE_TRUE= - MAINTAINER_MODE_FALSE='#' -else - MAINTAINER_MODE_TRUE='#' - MAINTAINER_MODE_FALSE= -fi - - MAINT=$MAINTAINER_MODE_TRUE - - - ac_config_headers="$ac_config_headers config.h" @@ -3149,6 +3512,8 @@ AM_BACKSLASH='\' +# ABI version +# # Set the libtool library version, see LIBRARY_INTERFACE # # See http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html @@ -3162,9 +3527,38 @@ # by the current interface, as they are ABI compatible (e.g. 
only new symbols # were added by the new interface) # -ELPA_SO_VERSION=5:0:1 +ELPA_SO_VERSION=15:0:0 + + +# AC_DEFINE_SUBST(NAME, VALUE, DESCRIPTION) +# ----------------------------------------- + + +# API Version + +$as_echo "#define EARLIEST_API_VERSION 20170403" >>confdefs.h + + + + +$as_echo "#define CURRENT_API_VERSION 20191110" >>confdefs.h + +CURRENT_API_VERSION='20191110' + + +# Autotune Version + +$as_echo "#define EARLIEST_AUTOTUNE_VERSION 20171201" >>confdefs.h + + +$as_echo "#define CURRENT_AUTOTUNE_VERSION 20190524" >>confdefs.h + + + +$as_echo "#define CURRENT_AUTOTUNE_VERSION 20190524" >>confdefs.h + +CURRENT_AUTOTUNE_VERSION='20190524' -# { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU make" >&5 @@ -3197,10 +3591,39 @@ as_fn_error $? "Need GNU Make" "$LINENO" 5 fi -#AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) -#if test x"${CPP_FOUND}" = xno; then -# AC_MSG_ERROR([no cpp found]) -#fi +enable_legacy=no + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether in C interface the error argument should be optional" >&5 +$as_echo_n "checking whether in C interface the error argument should be optional... " >&6; } +# Check whether --enable-optional-argument-in-C-API was given. 
+if test "${enable_optional_argument_in_C_API+set}" = set; then : + enableval=$enable_optional_argument_in_C_API; + if test x"$enableval" = x"yes"; then + optional_c_error_argument=yes + else + optional_c_error_argument=no + fi + +else + optional_c_error_argument=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${optional_c_error_argument}" >&5 +$as_echo "${optional_c_error_argument}" >&6; } + if test x"$optional_c_error_argument" = x"yes"; then + OPTIONAL_C_ERROR_ARGUMENT_TRUE= + OPTIONAL_C_ERROR_ARGUMENT_FALSE='#' +else + OPTIONAL_C_ERROR_ARGUMENT_TRUE='#' + OPTIONAL_C_ERROR_ARGUMENT_FALSE= +fi + +if test x"${optional_c_error_argument}" = x"yes"; then + +$as_echo "#define OPTIONAL_C_ERROR_ARGUMENT 1" >>confdefs.h + +fi + # gnu-make fortran module dependencies @@ -3228,7 +3651,6 @@ -### # openmp.m4 serial 4 @@ -3268,12 +3690,17 @@ # an output file called 'penmp' rather than activating OpenMP support. - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether --enable-openmp is specified" >&5 $as_echo_n "checking whether --enable-openmp is specified... " >&6; } # Check whether --enable-openmp was given. if test "${enable_openmp+set}" = set; then : enableval=$enable_openmp; + if test x"$enableval" = x"yes"; then + enable_openmp=yes + else + enable_openmp=no + fi + else enable_openmp=no fi @@ -3295,6 +3722,7 @@ fi + # Check whether --with-mpi was given. if test "${with_mpi+set}" = set; then : withval=$with_mpi; @@ -3302,7 +3730,7 @@ with_mpi=yes fi - if test x"with_mpi" = x"yes"; then + if test x"$with_mpi" = x"yes"; then WITH_MPI_TRUE= WITH_MPI_FALSE='#' else @@ -3316,56 +3744,92 @@ fi -# C + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether --enable-scalapack-tests is specified" >&5 +$as_echo_n "checking whether --enable-scalapack-tests is specified... " >&6; } +# Check whether --enable-scalapack-tests was given. 
+if test "${enable_scalapack_tests+set}" = set; then : + enableval=$enable_scalapack_tests; + if test x"$enableval" = x"yes"; then + enable_scalapack_tests=yes + else + enable_scalapack_tests=no + fi + +else + enable_scalapack_tests="no" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_scalapack_tests" >&5 +$as_echo "$enable_scalapack_tests" >&6; } +if test x"${enable_scalapack_tests}" = x"yes"; then + if test x"$with_mpi" = x"no"; then + as_fn_error $? "You cannot build the SCALAPCK test cases without MPI" "$LINENO" 5 + fi + +$as_echo "#define WITH_SCALAPACK_TESTS 1" >>confdefs.h + +fi + if test x"$enable_scalapack_tests" = x"yes"; then + WITH_SCALAPACK_TESTS_TRUE= + WITH_SCALAPACK_TESTS_FALSE='#' +else + WITH_SCALAPACK_TESTS_TRUE='#' + WITH_SCALAPACK_TESTS_FALSE= +fi + + + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + DEPDIR="${am__leading_dot}deps" ac_config_commands="$ac_config_commands depfiles" - -am_make=${MAKE-make} -cat > confinc << 'END' +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} supports the include directive" >&5 +$as_echo_n "checking whether ${MAKE-make} supports the include directive... " >&6; } +cat > confinc.mk << 'END' am__doit: - @echo this is the am__doit target + @echo this is the am__doit target >confinc.out .PHONY: am__doit END -# If we don't find an include directive, just comment out the code. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5 -$as_echo_n "checking for style of include used by $am_make... " >&6; } am__include="#" am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. 
-case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD +# BSD make does it like this. +echo '.include "confinc.mk" # ignored' > confmf.BSD +# Other make implementations (GNU, Solaris 10, AIX) do it like this. +echo 'include confinc.mk # ignored' > confmf.GNU +_am_result=no +for s in GNU BSD; do + { echo "$as_me:$LINENO: ${MAKE-make} -f confmf.$s && cat confinc.out" >&5 + (${MAKE-make} -f confmf.$s && cat confinc.out) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + case $?:`cat confinc.out 2>/dev/null` in #( + '0:this is the am__doit target') : + case $s in #( + BSD) : + am__include='.include' am__quote='"' ;; #( + *) : + am__include='include' am__quote='' ;; +esac ;; #( + *) : ;; - esac -fi - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5 -$as_echo "$_am_result" >&6; } -rm -f confinc confmf +esac + if test "$am__include" != "#"; then + _am_result="yes ($s style)" + break + fi +done +rm -f confinc.* confmf.* +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${_am_result}" >&5 +$as_echo "${_am_result}" >&6; } # Check whether --enable-dependency-tracking was given. 
if test "${enable_dependency_tracking+set}" = set; then : @@ -4624,7 +5088,7 @@ ac_cv_prog_c_openmp='none needed' else ac_cv_prog_c_openmp='unsupported' - for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do + for ac_option in -fopenmp -qopenmp -xopenmp -mp -omp -qsmp=omp -openmp; do ac_save_CFLAGS=$CFLAGS CFLAGS="$CFLAGS $ac_option" cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -4668,1319 +5132,2757 @@ CFLAGS="$OPENMP_CFLAGS $CFLAGS" fi - -if test -n "$ac_tool_prefix"; then - for ac_prog in ar lib "link -lib" - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_AR+:} false; then : +c11_standard=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -std=gnu11" >&5 +$as_echo_n "checking whether C compiler accepts -std=gnu11... " >&6; } +if ${ax_cv_check_cflags___std_gnu11+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$AR"; then - ac_cv_prog_AR="$AR" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_AR="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -std=gnu11" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ax_cv_check_cflags___std_gnu11=yes +else + ax_cv_check_cflags___std_gnu11=no fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags fi -AR=$ac_cv_prog_AR -if test -n "$AR"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5 -$as_echo "$AR" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___std_gnu11" >&5 +$as_echo "$ax_cv_check_cflags___std_gnu11" >&6; } +if test "x$ax_cv_check_cflags___std_gnu11" = xyes; then : + + c11_standard=yes + else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi + echo "C compiler cannot compile -std=gnu11 code" + echo "testing -std=c11.." - test -n "$AR" && break - done fi -if test -z "$AR"; then - ac_ct_AR=$AR - for ac_prog in ar lib "link -lib" -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_AR+:} false; then : + +if test x"$c11_standard" = x"yes"; then + CFLAGS+=" -std=gnu11" +fi + +if test x"$c11_standard" = x"no"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -std=c11" >&5 +$as_echo_n "checking whether C compiler accepts -std=c11... " >&6; } +if ${ax_cv_check_cflags___std_c11+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ac_ct_AR"; then - ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_AR="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS -fi -fi -ac_ct_AR=$ac_cv_prog_ac_ct_AR -if test -n "$ac_ct_AR"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5 -$as_echo "$ac_ct_AR" >&6; } + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -std=c11" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ax_cv_check_cflags___std_c11=yes else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + ax_cv_check_cflags___std_c11=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___std_c11" >&5 +$as_echo "$ax_cv_check_cflags___std_c11" >&6; } +if test "x$ax_cv_check_cflags___std_c11" = xyes; then : + c11_standard=yes - test -n "$ac_ct_AR" && break -done +else - if test "x$ac_ct_AR" = x; then - AR="false" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - AR=$ac_ct_AR + echo "C compiler cannot compile C11 code" + exit -1 + +fi + + if test x"$c11_standard" = x"yes"; then + CFLAGS+=" -std=c11" fi fi -: ${AR=ar} +# Make sure we can run config.sub. +$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the archiver ($AR) interface" >&5 -$as_echo_n "checking the archiver ($AR) interface... 
" >&6; } -if ${am_cv_ar_interface+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +$as_echo_n "checking build system type... " >&6; } +if ${ac_cv_build+:} false; then : $as_echo_n "(cached) " >&6 else - ac_ext=c + ac_build_alias=$build_alias +test "x$ac_build_alias" = x && + ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` +test "x$ac_build_alias" = x && + as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 +ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +$as_echo "$ac_cv_build" >&6; } +case $ac_cv_build in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; +esac +build=$ac_cv_build +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_build +shift +build_cpu=$1 +build_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +build_os=$* +IFS=$ac_save_IFS +case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +$as_echo_n "checking host system type... " >&6; } +if ${ac_cv_host+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$host_alias" = x; then + ac_cv_host=$ac_cv_build +else + ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +$as_echo "$ac_cv_host" >&6; } +case $ac_cv_host in +*-*-*) ;; +*) as_fn_error $? 
"invalid value of canonical host" "$LINENO" 5;; +esac +host=$ac_cv_host +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_host +shift +host_cpu=$1 +host_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +host_os=$* +IFS=$ac_save_IFS +case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac + + + +ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu - am_cv_ar_interface=ar - cat confdefs.h - <<_ACEOF >conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid output" >&5 +$as_echo_n "checking for x86 cpuid output... " >&6; } +if ${ax_cv_gcc_x86_cpuid_+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_=unknown +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -int some_variable = 0; +#include +int +main () +{ + + int op = , level = 0, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + + ; + return 0; +} _ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - am_ar_try='$AR cru libconftest.a conftest.$ac_objext >&5' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5 - (eval $am_ar_try) 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - if test "$ac_status" -eq 0; then - am_cv_ar_interface=ar - else - am_ar_try='$AR -NOLOGO -OUT:conftest.lib conftest.$ac_objext >&5' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5 - (eval $am_ar_try) 2>&5 - ac_status=$? 
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - if test "$ac_status" -eq 0; then - am_cv_ar_interface=lib - else - am_cv_ar_interface=unknown - fi - fi - rm -f conftest.lib libconftest.a +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_=`cat conftest_cpuid`; rm -f conftest_cpuid +else + ax_cv_gcc_x86_cpuid_=unknown; rm -f conftest_cpuid +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + +ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86-AVX xgetbv output" >&5 +$as_echo_n "checking for x86-AVX xgetbv output... " >&6; } +if ${ax_cv_gcc_x86_avx_xgetbv_+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_avx_xgetbv_=unknown +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ + + int op = , eax, edx; + FILE *f; + /* Opcodes for xgetbv */ + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (op)); + f = fopen("conftest_xgetbv", "w"); if (!f) return 1; + fprintf(f, "%x:%x\n", eax, edx); + fclose(f); + return 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_avx_xgetbv_=`cat conftest_xgetbv`; rm -f conftest_xgetbv +else + ax_cv_gcc_x86_avx_xgetbv_=unknown; rm -f conftest_xgetbv +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_ar_interface" >&5 -$as_echo "$am_cv_ar_interface" >&6; } -case $am_cv_ar_interface in -ar) - ;; -lib) - # Microsoft lib, so override with the ar-lib wrapper script. - # FIXME: It is wrong to rewrite AR. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__AR in this case, - # and then we could set am__AR="$am_aux_dir/ar-lib \$(AR)" or something - # similar. - AR="$am_aux_dir/ar-lib $AR" - ;; -unknown) - as_fn_error $? "could not determine $AR interface" "$LINENO" 5 - ;; -esac +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_avx_xgetbv_" >&5 +$as_echo "$ax_cv_gcc_x86_avx_xgetbv_" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -# By default we simply use the C compiler to build assembly code. -test "${CCAS+set}" = set || CCAS=$CC -test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS -depcc="$CCAS" am_compiler_list= -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... 
" >&6; } -if ${am_cv_CCAS_dependencies_compiler_type+:} false; then : + CPUEXT_FLAGS="" + SIMD_FLAGS="" + + case $host_cpu in + powerpc*) + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether altivec is supported for old distros" >&5 +$as_echo_n "checking whether altivec is supported for old distros... " >&6; } +if ${ax_cv_have_altivec_old_ext+:} false; then : $as_echo_n "(cached) " >&6 else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named 'D' -- because '-MD' means "put the output - # in D". - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. 
- mkdir sub - am_cv_CCAS_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - am__universal=false + if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then + if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then + ax_cv_have_altivec_old_ext=yes + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_have_altivec_old_ext" >&5 +$as_echo "$ax_cv_have_altivec_old_ext" >&6; } - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with - # Solaris 10 /bin/sh. - echo '/* dummy */' > sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + if test "$ax_cv_have_altivec_old_ext" = yes; then - # We check with '-c' and '-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle '-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs. - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # After this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested. 
- if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok '-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CCAS_dependencies_compiler_type=$depmode - break - fi - fi - done +$as_echo "#define HAVE_ALTIVEC /**/" >>confdefs.h - cd .. - rm -rf conftest.dir + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -faltivec" >&5 +$as_echo_n "checking whether C compiler accepts -faltivec... 
" >&6; } +if ${ax_cv_check_cflags___faltivec+:} false; then : + $as_echo_n "(cached) " >&6 else - am_cv_CCAS_dependencies_compiler_type=none -fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; } -CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -faltivec" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then - am__fastdepCCAS_TRUE= - am__fastdepCCAS_FALSE='#' +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ax_cv_check_cflags___faltivec=yes else - am__fastdepCCAS_TRUE='#' - am__fastdepCCAS_FALSE= + ax_cv_check_cflags___faltivec=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___faltivec" >&5 +$as_echo "$ax_cv_check_cflags___faltivec" >&6; } +if test "x$ax_cv_check_cflags___faltivec" = xyes; then : + SIMD_FLAGS="$SIMD_FLAGS -faltivec" +else + : fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether altivec is supported" >&5 +$as_echo_n "checking whether altivec is supported... 
" >&6; } +if ${ax_cv_have_altivec_ext+:} false; then : + $as_echo_n "(cached) " >&6 +else -# Fortran -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c altivec` != 0; then + ax_cv_have_altivec_ext=yes + fi -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) -# -# DESCRIPTION -# -# This macro tries to find out how to compile Fortran77 programs that use -# MPI (Message Passing Interface), a standard API for parallel process -# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to -# be used instead of the standard macro AC_PROG_FC and will replace the -# standard variable FC with the found compiler. -# -# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the -# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will -# try to find out how to use MPI, if it fails, the macro will call -# AC_PROG_CC to find a standard C compiler instead. -# -# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found -# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If -# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. -# -# The following example demonstrates usage of the macro: -# -# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. -# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. -# # If --with-mpi=no is used, use a standard FC compiler instead. 
-# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], -# [compile with MPI (parallelization) support. If none is found, -# MPI is not used. Default: auto]) -# ],,[with_mpi=auto]) -# -# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ -# use_mpi=no -# if test x"$with_mpi" = xyes; then -# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) -# else -# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) -# fi -# ]) -# -# LICENSE -# -# Copyright (c) 2010,2011 Olaf Lenz -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. 
+fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_have_altivec_ext" >&5 +$as_echo "$ax_cv_have_altivec_ext" >&6; } -#serial 2 + if test "$ax_cv_have_altivec_ext" = yes; then +$as_echo "#define HAVE_ALTIVEC /**/" >>confdefs.h + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -maltivec" >&5 +$as_echo_n "checking whether C compiler accepts -maltivec... " >&6; } +if ${ax_cv_check_cflags___maltivec+:} false; then : + $as_echo_n "(cached) " >&6 +else + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -maltivec" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +int +main () +{ - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile using MPI" >&5 -$as_echo_n "checking whether to compile using MPI... " >&6; } - if test x"$with_mpi" = x"yes"; then - _ax_prog_fc_mpi_mpi_wanted=yes - else - _ax_prog_fc_mpi_mpi_wanted=no - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_ax_prog_fc_mpi_mpi_wanted" >&5 -$as_echo "$_ax_prog_fc_mpi_mpi_wanted" >&6; } + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ax_cv_check_cflags___maltivec=yes +else + ax_cv_check_cflags___maltivec=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___maltivec" >&5 +$as_echo "$ax_cv_check_cflags___maltivec" >&6; } +if test "x$ax_cv_check_cflags___maltivec" = xyes; then : + SIMD_FLAGS="$SIMD_FLAGS -maltivec" +else + : +fi - if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then - if test -n "$ac_tool_prefix"; then - for ac_prog in mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77 - do - # Extract the first word 
of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_FC+:} false; then : + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether vsx is supported" >&5 +$as_echo_n "checking whether vsx is supported... " >&6; } +if ${ax_cv_have_vsx_ext+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$FC"; then - ac_cv_prog_FC="$FC" # Let the user override the test. + + if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c vsx` != 0; then + ax_cv_have_vsx_ext=yes + fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_have_vsx_ext" >&5 +$as_echo "$ax_cv_have_vsx_ext" >&6; } + + if test "$ax_cv_have_vsx_ext" = yes; then + +$as_echo "#define HAVE_VSX /**/" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mvsx" >&5 +$as_echo_n "checking whether C compiler accepts -mvsx... " >&6; } +if ${ax_cv_check_cflags___mvsx+:} false; then : + $as_echo_n "(cached) " >&6 else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_FC="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -mvsx" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ax_cv_check_cflags___mvsx=yes +else + ax_cv_check_cflags___mvsx=no fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags fi -FC=$ac_cv_prog_FC -if test -n "$FC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FC" >&5 -$as_echo "$FC" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___mvsx" >&5 +$as_echo "$ax_cv_check_cflags___mvsx" >&6; } +if test "x$ax_cv_check_cflags___mvsx" = xyes; then : + SIMD_FLAGS="$SIMD_FLAGS -mvsx" else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + : fi + fi + ;; - test -n "$FC" && break - done -fi -if test -z "$FC"; then - ac_ct_FC=$FC - for ac_prog in mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77 -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_FC+:} false; then : + i[3456]86*|x86_64*|amd64*) + + + + + + eax_cpuid0=0 + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid 0x00000000 output" >&5 +$as_echo_n "checking for x86 cpuid 0x00000000 output... 
" >&6; } +if ${ax_cv_gcc_x86_cpuid_0x00000000+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ac_ct_FC"; then - ac_cv_prog_ac_ct_FC="$ac_ct_FC" # Let the user override the test. + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_0x00000000=unknown else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_FC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ + + int op = 0x00000000, level = 0, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_0x00000000=`cat conftest_cpuid`; rm -f conftest_cpuid +else + ax_cv_gcc_x86_cpuid_0x00000000=unknown; rm -f conftest_cpuid fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -ac_ct_FC=$ac_cv_prog_ac_ct_FC -if test -n "$ac_ct_FC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_FC" >&5 -$as_echo "$ac_ct_FC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_0x00000000" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_0x00000000" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS 
$LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - test -n "$ac_ct_FC" && break -done - if test "x$ac_ct_FC" = x; then - FC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - FC=$ac_ct_FC - fi -fi + if test "$ax_cv_gcc_x86_cpuid_0x00000000" != "unknown"; + then + eax_cpuid0=`echo $ax_cv_gcc_x86_cpuid_0x00000000 | cut -d ":" -f 1` + fi - fi - ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu -if test -n "$ac_tool_prefix"; then - for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_FC+:} false; then : + eax_cpuid80000000=0 + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid 0x80000000 output" >&5 +$as_echo_n "checking for x86 cpuid 0x80000000 output... " >&6; } +if ${ax_cv_gcc_x86_cpuid_0x80000000+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$FC"; then - ac_cv_prog_FC="$FC" # Let the user override the test. 
+ if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_0x80000000=unknown else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_FC="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ + int op = 0x80000000, level = 0, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_0x80000000=`cat conftest_cpuid`; rm -f conftest_cpuid +else + ax_cv_gcc_x86_cpuid_0x80000000=unknown; rm -f conftest_cpuid fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -FC=$ac_cv_prog_FC -if test -n "$FC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FC" >&5 -$as_echo "$FC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_0x80000000" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_0x80000000" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - test -n "$FC" && break - done -fi -if test -z "$FC"; then - ac_ct_FC=$FC - for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc 
pgfortran pgf95 lf95 ftn nagfor xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_FC+:} false; then : + + if test "$ax_cv_gcc_x86_cpuid_0x80000000" != "unknown"; + then + eax_cpuid80000000=`echo $ax_cv_gcc_x86_cpuid_0x80000000 | cut -d ":" -f 1` + fi + + ecx_cpuid1=0 + edx_cpuid1=0 + if test "$((0x$eax_cpuid0))" -ge 1 ; then + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid 0x00000001 output" >&5 +$as_echo_n "checking for x86 cpuid 0x00000001 output... " >&6; } +if ${ax_cv_gcc_x86_cpuid_0x00000001+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ac_ct_FC"; then - ac_cv_prog_ac_ct_FC="$ac_ct_FC" # Let the user override the test. + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_0x00000001=unknown else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_FC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ + int op = 0x00000001, level = 0, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_0x00000001=`cat conftest_cpuid`; rm -f conftest_cpuid +else + ax_cv_gcc_x86_cpuid_0x00000001=unknown; rm -f conftest_cpuid fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -ac_ct_FC=$ac_cv_prog_ac_ct_FC -if test -n "$ac_ct_FC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_FC" >&5 -$as_echo "$ac_ct_FC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_0x00000001" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_0x00000001" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - test -n "$ac_ct_FC" && break -done - if test "x$ac_ct_FC" = x; then - FC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - FC=$ac_ct_FC - fi -fi + if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown"; + then + ecx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` + edx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` + fi + fi + ebx_cpuid7=0 + ecx_cpuid7=0 + if test "$((0x$eax_cpuid0))" -ge 7 ; 
then -# Provide some information about the compiler. -$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran compiler version" >&5 -set X $ac_compile -ac_compiler=$2 -for ac_option in --version -v -V -qversion; do - { { ac_try="$ac_compiler $ac_option >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compiler $ac_option >&5") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - sed '10a\ -... rest of stderr output deleted ... - 10q' conftest.err >conftest.er1 - cat conftest.er1 >&5 - fi - rm -f conftest.er1 conftest.err - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } -done -rm -f a.out +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -# If we don't use `.F' as extension, the preprocessor is not run on the -# input file. (Note that this only needs to work for GNU compilers.) -ac_save_ext=$ac_ext -ac_ext=F -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran compiler" >&5 -$as_echo_n "checking whether we are using the GNU Fortran compiler... " >&6; } -if ${ac_cv_fc_compiler_gnu+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid 0x00000007 output" >&5 +$as_echo_n "checking for x86 cpuid 0x00000007 output... " >&6; } +if ${ax_cv_gcc_x86_cpuid_0x00000007+:} false; then : $as_echo_n "(cached) " >&6 else - cat > conftest.$ac_ext <<_ACEOF - program main -#ifndef __GNUC__ - choke me -#endif + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_0x00000007=unknown +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ - end + int op = 0x00000007, level = 0x00, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + + ; + return 0; +} _ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - ac_compiler_gnu=yes +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_0x00000007=`cat conftest_cpuid`; rm -f conftest_cpuid else - ac_compiler_gnu=no + ax_cv_gcc_x86_cpuid_0x00000007=unknown; rm -f conftest_cpuid +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_fc_compiler_gnu=$ac_compiler_gnu fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_compiler_gnu" >&5 -$as_echo "$ac_cv_fc_compiler_gnu" >&6; } -ac_ext=$ac_save_ext -ac_test_FCFLAGS=${FCFLAGS+set} -ac_save_FCFLAGS=$FCFLAGS -FCFLAGS= -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $FC accepts -g" >&5 -$as_echo_n "checking whether $FC accepts -g... 
" >&6; } -if ${ac_cv_prog_fc_g+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_0x00000007" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_0x00000007" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown"; + then + ebx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2` + ecx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 3` + fi + fi + + ecx_cpuid80000001=0 + edx_cpuid80000001=0 + if test "$((0x$eax_cpuid80000000))" -ge "$((0x80000001))" ; then + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86 cpuid 0x80000001 output" >&5 +$as_echo_n "checking for x86 cpuid 0x80000001 output... " >&6; } +if ${ax_cv_gcc_x86_cpuid_0x80000001+:} false; then : $as_echo_n "(cached) " >&6 else - FCFLAGS=-g -cat > conftest.$ac_ext <<_ACEOF - program main + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_cpuid_0x80000001=unknown +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ - end + int op = 0x80000001, level = 0, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; + + ; + return 0; +} _ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - ac_cv_prog_fc_g=yes +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_cpuid_0x80000001=`cat conftest_cpuid`; rm -f conftest_cpuid else - ac_cv_prog_fc_g=no + ax_cv_gcc_x86_cpuid_0x80000001=unknown; rm -f conftest_cpuid fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_g" >&5 -$as_echo "$ac_cv_prog_fc_g" >&6; } -if test "$ac_test_FCFLAGS" = set; then - FCFLAGS=$ac_save_FCFLAGS -elif test $ac_cv_prog_fc_g = yes; then - if test "x$ac_cv_fc_compiler_gnu" = xyes; then - FCFLAGS="-g -O2" - else - FCFLAGS="-g" - fi -else - if test "x$ac_cv_fc_compiler_gnu" = xyes; then - FCFLAGS="-O2" - else - FCFLAGS= - fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -if test $ac_compiler_gnu = yes; then - GFC=yes -else - GFC= fi -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_cpuid_0x80000001" >&5 +$as_echo "$ax_cv_gcc_x86_cpuid_0x80000001" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + if test "$ax_cv_gcc_x86_cpuid_0x80000001" != 
"unknown"; + then + ecx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 3` + edx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 4` + fi + fi + if ${ax_cv_have_mmx_os_support_ext+:} false; then : + $as_echo_n "(cached) " >&6 +else + ax_cv_have_mmx_os_support_ext=yes -# Check for compiler -# Needs to be split off into an extra macro to ensure right expansion -# order. +fi -if test x"$_ax_prog_fc_mpi_mpi_wanted" = xno; then : - _ax_prog_fc_mpi_mpi_found=no + ax_cv_have_none_os_support_ext=yes + + if ${ax_cv_have_sse_os_support_ext+:} false; then : + $as_echo_n "(cached) " >&6 else - ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + ax_cv_have_sse_os_support_ext=no, + if test "$((0x$edx_cpuid1>>25&0x01))" = 1; then + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + if test "$cross_compiling" = yes; then : + ax_cv_have_sse_os_support_ext=no +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - # test whether MPI_INIT is available - # We do not use AC_SEARCH_LIBS here, as it caches its outcome and - # thus disallows corresponding calls in the other AX_PROG_*_MPI - # macros. - for lib in NONE mpichf90 fmpi fmpich; do - save_LIBS=$LIBS - if test x"$lib" = xNONE; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for function MPI_INIT" >&5 -$as_echo_n "checking for function MPI_INIT... " >&6; } - else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for function MPI_INIT in -l$lib" >&5 -$as_echo_n "checking for function MPI_INIT in -l$lib... 
" >&6; } - LIBS="-l$lib $LIBS" - fi - cat > conftest.$ac_ext <<_ACEOF - program main - call MPI_INIT - end +#include +#include + /* No way at ring1 to ring3 in protected mode to check the CR0 and CR4 + control registers directly. Execute an SSE instruction. + If it raises SIGILL then OS doesn't support SSE based instructions */ + void sig_handler(int signum){ exit(1); } + int main(){ + signal(SIGILL, sig_handler); + /* SSE instruction xorps %xmm0,%xmm0 */ + __asm__ __volatile__ (".byte 0x0f, 0x57, 0xc0"); + return 0; + } _ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - _ax_prog_fc_mpi_mpi_found=yes +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_have_sse_os_support_ext=yes else - _ax_prog_fc_mpi_mpi_found=no + ax_cv_have_sse_os_support_ext=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_ax_prog_fc_mpi_mpi_found" >&5 -$as_echo "$_ax_prog_fc_mpi_mpi_found" >&6; } - if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then - break; - fi - LIBS=$save_LIBS - done - # Check for header - if test x"$_ax_prog_fc_mpi_mpi_found" = xyes; then : + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpif.h" >&5 -$as_echo_n "checking for mpif.h... 
" >&6; } - cat > conftest.$ac_ext <<_ACEOF - program main + fi - include 'mpif.h' +fi - end -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } + + xgetbv_eax=0 + if test "$((0x$ecx_cpuid1>>28&0x01))" = 1; then + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for x86-AVX xgetbv 0x00000000 output" >&5 +$as_echo_n "checking for x86-AVX xgetbv 0x00000000 output... " >&6; } +if ${ax_cv_gcc_x86_avx_xgetbv_0x00000000+:} false; then : + $as_echo_n "(cached) " >&6 else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - _ax_prog_fc_mpi_mpi_found=no + if test "$cross_compiling" = yes; then : + ax_cv_gcc_x86_avx_xgetbv_0x00000000=unknown +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ + int op = 0x00000000, eax, edx; + FILE *f; + /* Opcodes for xgetbv */ + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (op)); + f = fopen("conftest_xgetbv", "w"); if (!f) return 1; + fprintf(f, "%x:%x\n", eax, edx); + fclose(f); + return 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ax_cv_gcc_x86_avx_xgetbv_0x00000000=`cat conftest_xgetbv`; rm -f conftest_xgetbv +else + ax_cv_gcc_x86_avx_xgetbv_0x00000000=unknown; rm -f conftest_xgetbv +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi - ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_gcc_x86_avx_xgetbv_0x00000000" >&5 +$as_echo "$ax_cv_gcc_x86_avx_xgetbv_0x00000000" >&6; } +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -fi -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$_ax_prog_fc_mpi_mpi_found" = xyes; then : + if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then + xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` + fi - found_mpi_f=yes - : + if ${ax_cv_have_avx_os_support_ext+:} false; then : + $as_echo_n "(cached) " >&6 +else + + ax_cv_have_avx_os_support_ext=no + if test "$((0x$ecx_cpuid1>>27&0x01))" = 1; then + if test "$((0x$xgetbv_eax&0x6))" = 6; then + ax_cv_have_avx_os_support_ext=yes + fi + fi + +fi + + fi + if ${ax_cv_have_avx512_os_support_ext+:} false; then : + $as_echo_n "(cached) " >&6 else - 
found_mpi_f=no - : + ax_cv_have_avx512_os_support_ext=no + if test "$ax_cv_have_avx_os_support_ext" = yes; then + if test "$((0x$xgetbv_eax&0xe6))" = "$((0xe6))"; then + ax_cv_have_avx512_os_support_ext=yes + fi + fi fi -if test x"$with_mpi" = x"yes"; then - if test x"$found_mpi_f" = x"no"; then - as_fn_error $? "Could not compile an MPI Fortran program" "$LINENO" 5 - fi -fi -if test x"${enable_openmp}" = x"yes"; then - - OPENMP_FCFLAGS= - enable_openmp="yes" - if test "$enable_openmp" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for fc option to support OpenMP" >&5 -$as_echo_n "checking for fc option to support OpenMP... " >&6; } -if ${ac_cv_prog_fc_openmp+:} false; then : + for ac_instr_info in "none;rdrnd;RDRND;ecx_cpuid1,30;-mrdrnd;HAVE_RDRND;CPUEXT_FLAGS" "none;bmi1;BMI1;ebx_cpuid7,3;-mbmi;HAVE_BMI1;CPUEXT_FLAGS" "none;bmi2;BMI2;ebx_cpuid7,8;-mbmi2;HAVE_BMI2;CPUEXT_FLAGS" "none;adx;ADX;ebx_cpuid7,19;-madx;HAVE_ADX;CPUEXT_FLAGS" "none;mpx;MPX;ebx_cpuid7,14;-mmpx;HAVE_MPX;CPUEXT_FLAGS" "none;prefetchwt1;PREFETCHWT1;ecx_cpuid7,0;-mprefetchwt1;HAVE_PREFETCHWT1;CPUEXT_FLAGS" "none;abm;ABM;ecx_cpuid80000001,5;-mabm;HAVE_ABM;CPUEXT_FLAGS" "mmx;mmx;MMX;edx_cpuid1,23;-mmmx;HAVE_MMX;SIMD_FLAGS" "sse;sse;SSE;edx_cpuid1,25;-msse;HAVE_SSE;SIMD_FLAGS" "sse;sse2;SSE2;edx_cpuid1,26;-msse2;HAVE_SSE2;SIMD_FLAGS" "sse;sse3;SSE3;ecx_cpuid1,1;-msse3;HAVE_SSE3;SIMD_FLAGS" "sse;ssse3;SSSE3;ecx_cpuid1,9;-mssse3;HAVE_SSSE3;SIMD_FLAGS" "sse;sse41;SSE4.1;ecx_cpuid1,19;-msse4.1;HAVE_SSE4_1;SIMD_FLAGS" "sse;sse42;SSE4.2;ecx_cpuid1,20;-msse4.2;HAVE_SSE4_2;SIMD_FLAGS" "sse;sse4a;SSE4a;ecx_cpuid80000001,6;-msse4a;HAVE_SSE4a;SIMD_FLAGS" "sse;sha;SHA;ebx_cpuid7,29;-msha;HAVE_SHA;SIMD_FLAGS" "sse;aes;AES;ecx_cpuid1,25;-maes;HAVE_AES;SIMD_FLAGS" "avx;avx;AVX;ecx_cpuid1,28;-mavx;HAVE_AVX;SIMD_FLAGS" "avx;fma3;FMA3;ecx_cpuid1,12;-mfma;HAVE_FMA3;SIMD_FLAGS" "avx;fma4;FMA4;ecx_cpuid80000001,16;-mfma4;HAVE_FMA4;SIMD_FLAGS" 
"avx;xop;XOP;ecx_cpuid80000001,11;-mxop;HAVE_XOP;SIMD_FLAGS" "avx;avx2;AVX2;ebx_cpuid7,5;-mavx2;HAVE_AVX2;SIMD_FLAGS" "avx512;avx512f;AVX512-F;ebx_cpuid7,16;-mavx512f;HAVE_AVX512_F;SIMD_FLAGS" "avx512;avx512cd;AVX512-CD;ebx_cpuid7,28;-mavx512cd;HAVE_AVX512_CD;SIMD_FLAGS" "avx512;avx512pf;AVX512-PF;ebx_cpuid7,26;-mavx512pf;HAVE_AVX512_PF;SIMD_FLAGS" "avx512;avx512er;AVX512-ER;ebx_cpuid7,27;-mavx512er;HAVE_AVX512_ER;SIMD_FLAGS" "avx512;avx512vl;AVX512-VL;ebx_cpuid7,31;-mavx512vl;HAVE_AVX512_VL;SIMD_FLAGS" "avx512;avx512bw;AVX512-BW;ebx_cpuid7,30;-mavx512bw;HAVE_AVX512_BW;SIMD_FLAGS" "avx512;avx512dq;AVX512-DQ;ebx_cpuid7,17;-mavx512dq;HAVE_AVX512_DQ;SIMD_FLAGS" "avx512;avx512ifma;AVX512-IFMA;ebx_cpuid7,21;-mavx512ifma;HAVE_AVX512_IFMA;SIMD_FLAGS" "avx512;avx512vbmi;AVX512-VBMI;ecx_cpuid7,1;-mavx512vbmi;HAVE_AVX512_VBMI;SIMD_FLAGS" # + do ac_instr_os_support=$(eval echo \$ax_cv_have_$(echo $ac_instr_info | cut -d ";" -f 1)_os_support_ext) + ac_instr_acvar=$(echo $ac_instr_info | cut -d ";" -f 2) + ac_instr_shortname=$(echo $ac_instr_info | cut -d ";" -f 3) + ac_instr_chk_loc=$(echo $ac_instr_info | cut -d ";" -f 4) + ac_instr_chk_reg=0x$(eval echo \$$(echo $ac_instr_chk_loc | cut -d "," -f 1)) + ac_instr_chk_bit=$(echo $ac_instr_chk_loc | cut -d "," -f 2) + ac_instr_compiler_flags=$(echo $ac_instr_info | cut -d ";" -f 5) + ac_instr_have_define=$(echo $ac_instr_info | cut -d ";" -f 6) + ac_instr_flag_type=$(echo $ac_instr_info | cut -d ";" -f 7) + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${ac_instr_shortname} is supported by the processor" >&5 +$as_echo_n "checking whether ${ac_instr_shortname} is supported by the processor... 
" >&6; } +if eval \${ax_cv_have_${ac_instr_acvar}_cpu_ext+:} false; then : $as_echo_n "(cached) " >&6 else - cat > conftest.$ac_ext <<_ACEOF - - program test_openmp - use omp_lib - implicit none -!$ integer :: foobar - foobar = omp_get_num_threads() - end program - -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_prog_fc_openmp='none needed' -else - ac_cv_prog_fc_openmp='unsupported' - for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do - ac_save_FCFLAGS=$FCFLAGS - FCFLAGS="$FCFLAGS $ac_option" - cat > conftest.$ac_ext <<_ACEOF - - program test_openmp - use omp_lib - implicit none -!$ integer :: foobar - foobar = omp_get_num_threads() - end program - -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_prog_fc_openmp=$ac_option -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - FCFLAGS=$ac_save_FCFLAGS - if test "$ac_cv_prog_fc_openmp" != unsupported; then - break - fi - done -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5 -$as_echo "$ac_cv_prog_fc_openmp" >&6; } - case $ac_cv_prog_fc_openmp in #( - "none needed" | unsupported) - ;; #( - *) - OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;; - esac - fi - - - if test "$ac_cv_prog_fc_openmp" = unsupported; then - as_fn_error $? 
"Could not compile a Fortran program with OpenMP, adjust FCFLAGS" "$LINENO" 5 - fi - FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS" -fi - -## C++ -#AC_LANG([C++]) -#AC_PROG_CXX -# -#if test x"${enable_openmp}" = x"yes"; then -# AX_ELPA_OPENMP -# if test "$ac_cv_prog_cxx_openmp" = unsupported; then -# AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS]) -# fi -# CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS" -#fi - - - - - - -install_real_generic=yes -install_real_generic_simple=yes - -install_complex_generic=yes -install_complex_generic_simple=yes - -#want_avx=yes -#want_avx2=yes -#want_sse=yes - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ELPA should be build with ftimings support" >&5 -$as_echo_n "checking whether ELPA should be build with ftimings support... " >&6; } - -# Check whether --with-ftimings was given. -if test "${with_ftimings+set}" = set; then : - withval=$with_ftimings; with_ftimings=yes -else - with_ftimings=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_ftimings}" >&5 -$as_echo "${with_ftimings}" >&6; } - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether stdout/stderr file redirect should be enabled" >&5 -$as_echo_n "checking whether stdout/stderr file redirect should be enabled... " >&6; } + eval ax_cv_have_${ac_instr_acvar}_cpu_ext=no + if test "$((${ac_instr_chk_reg}>>${ac_instr_chk_bit}&0x01))" = 1 ; then + eval ax_cv_have_${ac_instr_acvar}_cpu_ext=yes + fi -# Check whether --with-redirect was given. 
-if test "${with_redirect+set}" = set; then : - withval=$with_redirect; with_redirect=yes -else - with_redirect=no fi +eval ac_res=\$ax_cv_have_${ac_instr_acvar}_cpu_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_redirect}" >&5 -$as_echo "${with_redirect}" >&6; } - -if test x"${with_redirect}" = x"yes"; then - -$as_echo "#define HAVE_REDIRECT 1" >>confdefs.h - -fi - if test x"$with_redirect" = x"yes"; then - HAVE_REDIRECT_TRUE= - HAVE_REDIRECT_FALSE='#' + if test x"$(eval echo \$ax_cv_have_${ac_instr_acvar}_cpu_ext)" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${ac_instr_shortname} is supported by the processor and OS" >&5 +$as_echo_n "checking whether ${ac_instr_shortname} is supported by the processor and OS... " >&6; } +if eval \${ax_cv_have_${ac_instr_acvar}_ext+:} false; then : + $as_echo_n "(cached) " >&6 else - HAVE_REDIRECT_TRUE='#' - HAVE_REDIRECT_FALSE= -fi - -if test x"${with_ftimings}" = x"yes"; then + eval ax_cv_have_${ac_instr_acvar}_ext=no + if test x"${ac_instr_os_support}" = x"yes"; then + eval ax_cv_have_${ac_instr_acvar}_ext=yes + fi -$as_echo "#define HAVE_DETAILED_TIMINGS 1" >>confdefs.h - - # Check whether --enable-papi was given. -if test "${enable_papi+set}" = set; then : - enableval=$enable_papi; want_papi=$enableval -else - want_papi="auto" fi +eval ac_res=\$ax_cv_have_${ac_instr_acvar}_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } - papi_found=unknown - if test x"$want_papi" != x"no" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for PAPI_library_init in -lpapi" >&5 -$as_echo_n "checking for PAPI_library_init in -lpapi... 
" >&6; } -if ${ac_cv_lib_papi_PAPI_library_init+:} false; then : + if test "$(eval echo \$ax_cv_have_${ac_instr_acvar}_ext)" = yes; then + as_CACHEVAR=`$as_echo "ax_cv_check_cflags__${ac_instr_compiler_flags}" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts ${ac_instr_compiler_flags}" >&5 +$as_echo_n "checking whether C compiler accepts ${ac_instr_compiler_flags}... " >&6; } +if eval \${$as_CACHEVAR+:} false; then : $as_echo_n "(cached) " >&6 else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpapi $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext + + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS ${ac_instr_compiler_flags}" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char PAPI_library_init (); int main () { -return PAPI_library_init (); + ; return 0; } _ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_papi_PAPI_library_init=yes +if ac_fn_c_try_compile "$LINENO"; then : + eval "$as_CACHEVAR=yes" else - ac_cv_lib_papi_PAPI_library_init=no + eval "$as_CACHEVAR=no" fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS=$ax_check_save_flags fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_papi_PAPI_library_init" >&5 -$as_echo "$ac_cv_lib_papi_PAPI_library_init" >&6; } -if test "x$ac_cv_lib_papi_PAPI_library_init" = xyes; then : - papi_found="yes" +eval ac_res=\$$as_CACHEVAR + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if eval test \"x\$"$as_CACHEVAR"\" = x"yes"; then : + eval ax_cv_support_${ac_instr_acvar}_ext=yes else - papi_found="no" + eval ax_cv_support_${ac_instr_acvar}_ext=no fi - 
if test x"$want_papi" = x"yes" ; then - if test x"$papi_found" = x"no" ; then - as_fn_error $? "\"Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS\"" "$LINENO" 5 - fi - fi - fi - if test x"$papi_found" = x"yes"; then + if test x"$(eval echo \$ax_cv_support_${ac_instr_acvar}_ext)" = x"yes"; then + eval ${ac_instr_flag_type}=\"\$${ac_instr_flag_type} ${ac_instr_compiler_flags}\" + cat >>confdefs.h <<_ACEOF +#define ${ac_instr_have_define} 1 +_ACEOF -$as_echo "#define HAVE_LIBPAPI 1" >>confdefs.h + else + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Your processor and OS supports ${ac_instr_shortname} instructions but not your compiler, can you try another compiler?" >&5 +$as_echo "$as_me: WARNING: Your processor and OS supports ${ac_instr_shortname} instructions but not your compiler, can you try another compiler?" >&2;} + fi + else + if test x"${ac_instr_os_support}" = x"no"; then + if eval \${ax_cv_support_${ac_instr_acvar}_ext+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval ax_cv_support_${ac_instr_acvar}_ext=no +fi - LIBS="-lpapi $LIBS" - fi + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Your processor supports ${ac_instr_shortname}, but your OS doesn't" >&5 +$as_echo "$as_me: WARNING: Your processor supports ${ac_instr_shortname}, but your OS doesn't" >&2;} + fi + fi + else + if eval \${ax_cv_have_${ac_instr_acvar}_ext+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval ax_cv_have_${ac_instr_acvar}_ext=no fi - if test x"$with_ftimings" = x"yes"; then - HAVE_DETAILED_TIMINGS_TRUE= - HAVE_DETAILED_TIMINGS_FALSE='#' + + if eval \${ax_cv_support_${ac_instr_acvar}_ext+:} false; then : + $as_echo_n "(cached) " >&6 else - HAVE_DETAILED_TIMINGS_TRUE='#' - HAVE_DETAILED_TIMINGS_FALSE= + eval ax_cv_support_${ac_instr_acvar}_ext=no fi + fi + done + ;; + esac -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether SSE assembly kernel can be compiled" >&5 -$as_echo_n "checking whether SSE assembly kernel can be compiled... 
" >&6; } -$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null -if test "$?" == 0; then - can_compile_sse_assembly=yes - install_real_sse_assembly=yes - install_complex_sse_assembly=yes -else - can_compile_sse_assembly=no - install_real_sse_assembly=no - install_complex_sse_assembly=no -fi - -rm -f ./test.o -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse_assembly}" >&5 -$as_echo "${can_compile_sse_assembly}" >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile SSE with gcc intrinsics in C" >&5 -$as_echo_n "checking whether we can compile SSE with gcc intrinsics in C... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - #include - int main(int argc, char **argv){ - double* q; - __m128d h1 = _mm_loaddup_pd(q); - return 0; - } -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - can_compile_sse_intrinsics=yes -else - can_compile_sse_intrinsics=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse_intrinsics}" >&5 -$as_echo "${can_compile_sse_intrinsics}" >&6; } -if test "${can_compile_sse_intrinsics}" = "yes"; then - install_real_sse_intrinsics=yes - install_real_sse_block2=yes - install_real_sse_block4=yes - install_real_sse_block6=yes - install_complex_sse_intrinsics=yes - install_complex_sse_block1=yes - install_complex_sse_block2=yes -else - install_real_sse_intrinsics=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_complex_sse_intrinsics=no - install_complex_sse_block1=no - install_complex_sse_block2=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX intrinsics in C" >&5 -$as_echo_n "checking whether we can compile AVX intrinsics in C... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ - #include - int main(int argc, char **argv){ - double* q; - __m256d a1_1 = _mm256_load_pd(q); - return 0; - } -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - can_compile_avx=yes -else - can_compile_avx=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx}" >&5 -$as_echo "${can_compile_avx}" >&6; } -#if test "${can_compile_avx}" = "yes" ; then -# AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++]) -# AC_LANG_PUSH([C++]) -# AC_COMPILE_IFELSE([AC_LANG_SOURCE([ -# #include -# int main(int argc, char **argv){ -# double* q; -# __m256d a1_1 = _mm256_load_pd(q); -# return 0; -# } -# ])], -# [can_compile_avx=yes], -# [can_compile_avx=no] -# ) -# AC_LANG_POP([C++]) -# AC_MSG_RESULT([${can_compile_avx}]) -# if test "${can_compile_avx}" = "no" ; then -# AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether]) -# fi -#fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 intrinsics in C" >&5 -$as_echo_n "checking whether we can compile AVX2 intrinsics in C... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ - #include - int main(int argc, char **argv){ - double* q; - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); - return 0; - } -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - can_compile_avx2=yes -else - can_compile_avx2=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx2}" >&5 -$as_echo "${can_compile_avx2}" >&6; } -#if test "${can_compile_avx2}" = "yes" ; then -# AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++]) -# AC_LANG_PUSH([C++]) -# AC_COMPILE_IFELSE([AC_LANG_SOURCE([ -# #include -# int main(int argc, char **argv){ -# double* q; -# __m256d q1 = _mm256_load_pd(q); -# __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); -# return 0; -# } -# ])], -# [can_compile_avx2=yes], -# [can_compile_avx2=no] -# ) -# AC_LANG_POP([C++]) -# AC_MSG_RESULT([${can_compile_avx2}]) -# if test "${can_compile_avx2}" = "no" ; then -# AC_MSG_WARN([Cannot compile C++ with AVX2!]) -# fi -#fi -if test "${can_compile_avx}" = "yes" ; then - install_real_avx_block2=yes - install_real_avx_block4=yes - install_real_avx_block6=yes - install_complex_avx_block1=yes - install_complex_avx_block2=yes -else - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - install_complex_avx_block1=no - install_complex_avx_block2=no -fi -if test "${can_compile_avx2}" = "yes" ; then - install_real_avx2_block2=yes - install_real_avx2_block4=yes - install_real_avx2_block6=yes - install_complex_avx2_block1=yes - install_complex_avx2_block2=yes -else - install_real_avx2_block2=no - install_real_avx2_block4=no - install_real_avx2_block6=no - install_complex_avx2_block1=no - install_complex_avx2_block2=no -fi - if test x"$can_compile_sse_assembly" = x"yes"; then - HAVE_SSE_ASSEMBLY_TRUE= - HAVE_SSE_ASSEMBLY_FALSE='#' -else - HAVE_SSE_ASSEMBLY_TRUE='#' - HAVE_SSE_ASSEMBLY_FALSE= -fi -if test x"${can_compile_sse_assembly}" = x"yes" ; then -$as_echo "#define 
HAVE_SSE_ASSEMBLY 1" >>confdefs.h -fi - if test x"$can_compile_sse_intrinsics" = x"yes"; then - HAVE_SSE_INTRINSICS_TRUE= - HAVE_SSE_INTRINSICS_FALSE='#' -else - HAVE_SSE_INTRINSICS_TRUE='#' - HAVE_SSE_INTRINSICS_FALSE= -fi -if test x"${can_compile_sse_intrinsics}" = x"yes" ; then -$as_echo "#define HAVE_SSE_INTRINSICS 1" >>confdefs.h -fi - if test x"$can_compile_avx" = x"yes"; then - HAVE_AVX_TRUE= - HAVE_AVX_FALSE='#' -else - HAVE_AVX_TRUE='#' - HAVE_AVX_FALSE= -fi -if test x"${can_compile_avx}" = x"yes" ; then -$as_echo "#define HAVE_AVX 1" >>confdefs.h -fi - if test x"$can_compile_avx2" = x"yes"; then - HAVE_AVX2_TRUE= - HAVE_AVX2_FALSE='#' +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether heterogenous-cluster-support should be enabled" >&5 +$as_echo_n "checking whether heterogenous-cluster-support should be enabled... " >&6; } +# Check whether --enable-heterogenous-cluster-support was given. +if test "${enable_heterogenous_cluster_support+set}" = set; then : + enableval=$enable_heterogenous_cluster_support; + if test x"$enableval" = x"yes"; then + enable_heterogenous_cluster_support=yes + else + enable_heterogenous_cluster_support=no + fi + else - HAVE_AVX2_TRUE='#' - HAVE_AVX2_FALSE= + enable_heterogenous_cluster_support="no" fi -if test x"${can_compile_avx2}" = x"yes" ; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_heterogenous_cluster_support" >&5 +$as_echo "$enable_heterogenous_cluster_support" >&6; } +if test x"${enable_heterogenous_cluster_support}" = x"yes"; then -$as_echo "#define HAVE_AVX2 1" >>confdefs.h +$as_echo "#define HAVE_HETEROGENOUS_CLUSTER_SUPPORT 1" >>confdefs.h fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether AVX optimization flags should be set automatically" >&5 -$as_echo_n "checking whether AVX optimization flags should be set automatically... " >&6; } - -# Check whether --with-avx-optimization was given. 
-if test "${with_avx_optimization+set}" = set; then : - withval=$with_avx_optimization; with_avx_optimization=yes + if test x"$enable_heterogenous_cluster_support" = x"yes"; then + HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE= + HAVE_HETEROGENOUS_CLUSTER_SUPPORT_FALSE='#' else - with_avx_optimization=no -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_avx_optimization}" >&5 -$as_echo "${with_avx_optimization}" >&6; } -if test x"${with_avx_optimization}" = x"yes"; then - CFLAGS="$CFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" - CXXFLAGS="$CXXFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" + HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE='#' + HAVE_HETEROGENOUS_CLUSTER_SUPPORT_FALSE= fi -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran flag needed to accept free-form source" >&5 -$as_echo_n "checking for Fortran flag needed to accept free-form source... " >&6; } -if ${ac_cv_fc_freeform+:} false; then : +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +$as_echo_n "checking how to run the C preprocessor... " >&6; } +# On Suns, sometimes $CPP names a directory. 
+if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then + if ${ac_cv_prog_CPP+:} false; then : $as_echo_n "(cached) " >&6 else - ac_cv_fc_freeform=unknown -ac_fc_freeform_FCFLAGS_save=$FCFLAGS -for ac_flag in none -ffree-form -FR -free -qfree -Mfree -Mfreeform \ - -freeform "-f free" -8 +source=free -nfix --nfix -Free + # Double quotes because CPP needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + do + ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes do - test "x$ac_flag" != xnone && FCFLAGS="$ac_fc_freeform_FCFLAGS_save $ac_flag" - cat > conftest.$ac_ext <<_ACEOF + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. 
+rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CPP=$CPP + +fi + CPP=$ac_cv_prog_CPP +else + ac_cv_prog_CPP=$CPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +$as_echo "$CPP" >&6; } +ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? 
"C preprocessor \"$CPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... " >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. 
+ # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +# On IRIX 5.3, sys/types and inttypes.h are conflicting. +for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + +# The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of long int" >&5 +$as_echo_n "checking size of long int... 
" >&6; } +if ${ac_cv_sizeof_long_int+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (long int))" "ac_cv_sizeof_long_int" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_long_int" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (long int) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_long_int=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_long_int" >&5 +$as_echo "$ac_cv_sizeof_long_int" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_LONG_INT $ac_cv_sizeof_long_int +_ACEOF + + +size_of_long_int="${ac_cv_sizeof_long_int}" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK)" >&5 +$as_echo_n "checking whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK)... " >&6; } +# Check whether --enable-64bit-integer-math-support was given. +if test "${enable_64bit_integer_math_support+set}" = set; then : + enableval=$enable_64bit_integer_math_support; + if test x"$enableval" = x"yes"; then + enable_64bit_integer_math_support=yes + else + enable_64bit_integer_math_support=no + fi + +else + enable_64bit_integer_math_support="no" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_64bit_integer_math_support" >&5 +$as_echo "$enable_64bit_integer_math_support" >&6; } +if test x"${enable_64bit_integer_math_support}" = x"yes"; then + if test x"${enable_c_tests}" = x"yes"; then + as_fn_error $? "You cannot both define 64bit integer support and C tests. Reconfigure!" "$LINENO" 5 + fi + if test x"${size_of_long_int}" = x"8"; then + echo "Found C data-type \"long int\" with 8 bytes" + else + as_fn_error $? 
"The C data-type \"long int\" is only ${size_of_long_int} bytes; Needed is 8 bytes" "$LINENO" 5 + fi + + +$as_echo "#define HAVE_64BIT_INTEGER_MATH_SUPPORT 1" >>confdefs.h + +fi + if test x"$enable_64bit_integer_math_support" = x"yes"; then + HAVE_64BIT_INTEGER_MATH_SUPPORT_TRUE= + HAVE_64BIT_INTEGER_MATH_SUPPORT_FALSE='#' +else + HAVE_64BIT_INTEGER_MATH_SUPPORT_TRUE='#' + HAVE_64BIT_INTEGER_MATH_SUPPORT_FALSE= +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether 64bit integers should be used for the MPI library" >&5 +$as_echo_n "checking whether 64bit integers should be used for the MPI library... " >&6; } +# Check whether --enable-64bit-integer-mpi-support was given. +if test "${enable_64bit_integer_mpi_support+set}" = set; then : + enableval=$enable_64bit_integer_mpi_support; + if test x"$enableval" = x"yes"; then + enable_64bit_integer_mpi_support=yes + else + enable_64bit_integer_mpi_support=no + fi + +else + enable_64bit_integer_mpi_support="no" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $enable_64bit_integer_mpi_support" >&5 +$as_echo "$enable_64bit_integer_mpi_support" >&6; } +if test x"${enable_64bit_integer_mpi_support}" = x"yes"; then + +$as_echo "#define HAVE_64BIT_INTEGER_MPI_SUPPORT 1" >>confdefs.h + +fi + if test x"$enable_64bit_integer_mpi_support" = x"yes"; then + HAVE_64BIT_INTEGER_MPI_SUPPORT_TRUE= + HAVE_64BIT_INTEGER_MPI_SUPPORT_FALSE='#' +else + HAVE_64BIT_INTEGER_MPI_SUPPORT_TRUE='#' + HAVE_64BIT_INTEGER_MPI_SUPPORT_FALSE= +fi + + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler can use _Generic " >&5 +$as_echo_n "checking whether C compiler can use _Generic ... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int main(int argc, char **argv) { +#define elpa_set(e, name, value, error) _Generic((value), \ + int: \ + elpa_set_integer, \ + \ + double: \ + elpa_set_double \ + )(e, name, value, error) + + return 0; +} + +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_generic=yes +else + can_compile_generic=no + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_generic}" >&5 +$as_echo "${can_compile_generic}" >&6; } +if test x"$can_compile_generic" != x"yes"; then + as_fn_error $? "C compiler cannot handle _Generic statement! Upgrade or change C compiler" "$LINENO" 5 +fi + +$as_echo "#define HAVE_VSX_SSE 1" >>confdefs.h + + + +if test -n "$ac_tool_prefix"; then + for ac_prog in ar lib "link -lib" + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AR"; then + ac_cv_prog_AR="$AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AR="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AR=$ac_cv_prog_AR +if test -n "$AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5 +$as_echo "$AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$AR" && break + done +fi +if test -z "$AR"; then + ac_ct_AR=$AR + for ac_prog in ar lib "link -lib" +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_AR"; then + ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_AR="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_AR=$ac_cv_prog_ac_ct_AR +if test -n "$ac_ct_AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5 +$as_echo "$ac_ct_AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_AR" && break +done + + if test "x$ac_ct_AR" = x; then + AR="false" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + AR=$ac_ct_AR + fi +fi + +: ${AR=ar} + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the archiver ($AR) interface" >&5 +$as_echo_n "checking the archiver ($AR) interface... " >&6; } +if ${am_cv_ar_interface+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + am_cv_ar_interface=ar + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +int some_variable = 0; +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + am_ar_try='$AR cru libconftest.a conftest.$ac_objext >&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5 + (eval $am_ar_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } + if test "$ac_status" -eq 0; then + am_cv_ar_interface=ar + else + am_ar_try='$AR -NOLOGO -OUT:conftest.lib conftest.$ac_objext >&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5 + (eval $am_ar_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + if test "$ac_status" -eq 0; then + am_cv_ar_interface=lib + else + am_cv_ar_interface=unknown + fi + fi + rm -f conftest.lib libconftest.a + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_ar_interface" >&5 +$as_echo "$am_cv_ar_interface" >&6; } + +case $am_cv_ar_interface in +ar) + ;; +lib) + # Microsoft lib, so override with the ar-lib wrapper script. + # FIXME: It is wrong to rewrite AR. + # But if we don't then we get into trouble of one sort or another. + # A longer-term fix would be to have automake use am__AR in this case, + # and then we could set am__AR="$am_aux_dir/ar-lib \$(AR)" or something + # similar. + AR="$am_aux_dir/ar-lib $AR" + ;; +unknown) + as_fn_error $? "could not determine $AR interface" "$LINENO" 5 + ;; +esac + +# By default we simply use the C compiler to build assembly code. + +test "${CCAS+set}" = set || CCAS=$CC +test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS + + + +depcc="$CCAS" am_compiler_list= + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +$as_echo_n "checking dependency style of $depcc... " >&6; } +if ${am_cv_CCAS_dependencies_compiler_type+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. 
Otherwise we can end up + # making bogus files that we don't know about and never remove. For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub + + am_cv_CCAS_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` + fi + am__universal=false + + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. + echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. 
+ am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_CCAS_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. 
+ rm -rf conftest.dir +else + am_cv_CCAS_dependencies_compiler_type=none +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 +$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; } +CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type + + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then + am__fastdepCCAS_TRUE= + am__fastdepCCAS_FALSE='#' +else + am__fastdepCCAS_TRUE='#' + am__fastdepCCAS_FALSE= +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 +$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } +if ${ac_cv_prog_cc_c99+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +#include + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +#define debug(...) fprintf (stderr, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. 
+#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + your preprocessor is broken; +#endif +#if BIG_OK +#else + your preprocessor is broken; +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. + // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\0'; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static void +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str; + int number; + float fnumber; + + while (*format) + { + switch (*format++) + { + case 's': // string + str = va_arg (args_copy, const char *); + break; + case 'd': // int + number = va_arg (args_copy, int); + break; + case 'f': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); +} + +int +main () +{ + + // Check bool. + _Bool success = false; + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + + // Check varargs. + test_varargs ("s, d' f .", "string", 65, 34.234); + test_varargs_macros (); + + // Check flexible array members. + struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. 
+ struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' + || dynamic_array[ni.number - 1] != 543); + + ; + return 0; +} +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c99=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c99" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c99" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 +$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c99" != xno; then : + +fi + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +# Fortran +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) +# 
+# DESCRIPTION +# +# This macro tries to find out how to compile Fortran77 programs that use +# MPI (Message Passing Interface), a standard API for parallel process +# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to +# be used instead of the standard macro AC_PROG_FC and will replace the +# standard variable FC with the found compiler. +# +# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the +# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will +# try to find out how to use MPI, if it fails, the macro will call +# AC_PROG_CC to find a standard C compiler instead. +# +# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found +# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If +# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. +# +# The following example demonstrates usage of the macro: +# +# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. +# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. +# # If --with-mpi=no is used, use a standard FC compiler instead. +# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], +# [compile with MPI (parallelization) support. If none is found, +# MPI is not used. Default: auto]) +# ],,[with_mpi=auto]) +# +# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ +# use_mpi=no +# if test x"$with_mpi" = xyes; then +# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) +# else +# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) +# fi +# ]) +# +# LICENSE +# +# Copyright (c) 2010,2011 Olaf Lenz +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + + + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile using MPI" >&5 +$as_echo_n "checking whether to compile using MPI... 
" >&6; } + if test x"$with_mpi" = x"yes"; then + _ax_prog_fc_mpi_mpi_wanted=yes + else + _ax_prog_fc_mpi_mpi_wanted=no + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_ax_prog_fc_mpi_mpi_wanted" >&5 +$as_echo "$_ax_prog_fc_mpi_mpi_wanted" >&6; } + + if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then + if test -n "$ac_tool_prefix"; then + for ac_prog in mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77 + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_FC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$FC"; then + ac_cv_prog_FC="$FC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_FC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +FC=$ac_cv_prog_FC +if test -n "$FC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FC" >&5 +$as_echo "$FC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$FC" && break + done +fi +if test -z "$FC"; then + ac_ct_FC=$FC + for ac_prog in mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77 +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_FC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_FC"; then + ac_cv_prog_ac_ct_FC="$ac_ct_FC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_FC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_FC=$ac_cv_prog_ac_ct_FC +if test -n "$ac_ct_FC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_FC" >&5 +$as_echo "$ac_ct_FC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_FC" && break +done + + if test "x$ac_ct_FC" = x; then + FC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + FC=$ac_ct_FC + fi +fi + + fi + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu +if test -n "$ac_tool_prefix"; then + for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_FC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$FC"; then + ac_cv_prog_FC="$FC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_FC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +FC=$ac_cv_prog_FC +if test -n "$FC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FC" >&5 +$as_echo "$FC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$FC" && break + done +fi +if test -z "$FC"; then + ac_ct_FC=$FC + for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_FC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_FC"; then + ac_cv_prog_ac_ct_FC="$ac_ct_FC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_FC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_FC=$ac_cv_prog_ac_ct_FC +if test -n "$ac_ct_FC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_FC" >&5 +$as_echo "$ac_ct_FC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_FC" && break +done + + if test "x$ac_ct_FC" = x; then + FC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + FC=$ac_ct_FC + fi +fi + + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done +rm -f a.out + +# If we don't use `.F' as extension, the preprocessor is not run on the +# input file. (Note that this only needs to work for GNU compilers.) 
+ac_save_ext=$ac_ext +ac_ext=F +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran compiler" >&5 +$as_echo_n "checking whether we are using the GNU Fortran compiler... " >&6; } +if ${ac_cv_fc_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main +#ifndef __GNUC__ + choke me +#endif + + end +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_fc_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_compiler_gnu" >&5 +$as_echo "$ac_cv_fc_compiler_gnu" >&6; } +ac_ext=$ac_save_ext +ac_test_FCFLAGS=${FCFLAGS+set} +ac_save_FCFLAGS=$FCFLAGS +FCFLAGS= +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $FC accepts -g" >&5 +$as_echo_n "checking whether $FC accepts -g... " >&6; } +if ${ac_cv_prog_fc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + FCFLAGS=-g +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_cv_prog_fc_g=yes +else + ac_cv_prog_fc_g=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_g" >&5 +$as_echo "$ac_cv_prog_fc_g" >&6; } +if test "$ac_test_FCFLAGS" = set; then + FCFLAGS=$ac_save_FCFLAGS +elif test $ac_cv_prog_fc_g = yes; then + if test "x$ac_cv_fc_compiler_gnu" = xyes; then + FCFLAGS="-g -O2" + else + FCFLAGS="-g" + fi +else + if test "x$ac_cv_fc_compiler_gnu" = xyes; then + FCFLAGS="-O2" + else + FCFLAGS= + fi +fi + +if test $ac_compiler_gnu = yes; then + GFC=yes +else + GFC= +fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + + + + + +# Check for 
compiler +# Needs to be split off into an extra macro to ensure right expansion +# order. + + +if test x"$_ax_prog_fc_mpi_mpi_wanted" = xno; then : + _ax_prog_fc_mpi_mpi_found=no +else + + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + + # test whether MPI_INIT is available + # We do not use AC_SEARCH_LIBS here, as it caches its outcome and + # thus disallows corresponding calls in the other AX_PROG_*_MPI + # macros. + for lib in NONE mpichf90 fmpi fmpich; do + save_LIBS=$LIBS + if test x"$lib" = xNONE; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for function MPI_INIT" >&5 +$as_echo_n "checking for function MPI_INIT... " >&6; } + else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for function MPI_INIT in -l$lib" >&5 +$as_echo_n "checking for function MPI_INIT in -l$lib... " >&6; } + LIBS="-l$lib $LIBS" + fi + cat > conftest.$ac_ext <<_ACEOF + program main + call MPI_INIT + end +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + _ax_prog_fc_mpi_mpi_found=yes +else + _ax_prog_fc_mpi_mpi_found=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_ax_prog_fc_mpi_mpi_found" >&5 +$as_echo "$_ax_prog_fc_mpi_mpi_found" >&6; } + if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then + break; + fi + LIBS=$save_LIBS + done + + # Check for header + if test x"$_ax_prog_fc_mpi_mpi_found" = xyes; then : + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpif.h" >&5 +$as_echo_n "checking for mpif.h... 
" >&6; } + cat > conftest.$ac_ext <<_ACEOF + program main + + include 'mpif.h' + + end +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + _ax_prog_fc_mpi_mpi_found=no + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +fi + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$_ax_prog_fc_mpi_mpi_found" = xyes; then : + + found_mpi_f=yes + : + +else + + found_mpi_f=no + : + +fi + + +if test x"$with_mpi" = x"yes"; then + if test x"$found_mpi_f" = x"no"; then + as_fn_error $? "Could not compile an MPI Fortran program" "$LINENO" 5 + fi +fi + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran flag to compile .F90 files" >&5 +$as_echo_n "checking for Fortran flag to compile .F90 files... 
" >&6; } +if ${ac_cv_fc_srcext_F90+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_ext=F90 +ac_fcflags_srcext_save=$ac_fcflags_srcext +ac_fcflags_srcext= +ac_cv_fc_srcext_F90=unknown +case $ac_ext in #( + [fF]77) ac_try=f77;; #( + *) ac_try=f95;; +esac +for ac_flag in none -qsuffix=f=F90 -Tf "-x $ac_try"; do + test "x$ac_flag" != xnone && ac_fcflags_srcext="$ac_flag" + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_cv_fc_srcext_F90=$ac_flag; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +rm -f conftest.$ac_objext conftest.F90 +ac_fcflags_srcext=$ac_fcflags_srcext_save + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_srcext_F90" >&5 +$as_echo "$ac_cv_fc_srcext_F90" >&6; } +if test "x$ac_cv_fc_srcext_F90" = xunknown; then + as_fn_error $? "Fortran could not compile .F90 files" "$LINENO" 5 +else + ac_fc_srcext=F90 + if test "x$ac_cv_fc_srcext_F90" = xnone; then + ac_fcflags_srcext="" + FCFLAGS_F90="" + else + ac_fcflags_srcext=$ac_cv_fc_srcext_F90 + FCFLAGS_F90=$ac_cv_fc_srcext_F90 + fi + + +fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran flag needed to accept free-form source" >&5 +$as_echo_n "checking for Fortran flag needed to accept free-form source... 
" >&6; } +if ${ac_cv_fc_freeform+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_fc_freeform=unknown +ac_fc_freeform_FCFLAGS_save=$FCFLAGS +for ac_flag in none -ffree-form -FR -free -qfree -Mfree -Mfreeform \ + -freeform "-f free" -8 +source=free -nfix --nfix -Free +do + test "x$ac_flag" != xnone && FCFLAGS="$ac_fc_freeform_FCFLAGS_save $ac_flag" + cat > conftest.$ac_ext <<_ACEOF program freeform ! FIXME: how to best confuse non-freeform compilers? @@ -5989,3433 +7891,4090 @@ end _ACEOF if ac_fn_fc_try_compile "$LINENO"; then : - ac_cv_fc_freeform=$ac_flag; break + ac_cv_fc_freeform=$ac_flag; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +rm -f conftest.err conftest.$ac_objext conftest.$ac_ext +FCFLAGS=$ac_fc_freeform_FCFLAGS_save + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_freeform" >&5 +$as_echo "$ac_cv_fc_freeform" >&6; } +if test "x$ac_cv_fc_freeform" = xunknown; then + as_fn_error 77 "Fortran does not accept free-form source" "$LINENO" 5 +else + if test "x$ac_cv_fc_freeform" != xnone; then + FCFLAGS="$FCFLAGS $ac_cv_fc_freeform" + fi + +fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking Fortran 90 module inclusion flag" >&5 +$as_echo_n "checking Fortran 90 module inclusion flag... 
" >&6; } +if ${ac_cv_fc_module_flag+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + +ac_cv_fc_module_flag=unknown +mkdir conftest.dir +cd conftest.dir +cat > conftest.$ac_ext <<_ACEOF + + module conftest_module + contains + subroutine conftest_routine + write(*,'(a)') 'gotcha!' + end subroutine + end module +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + cd .. + ac_fc_module_flag_FCFLAGS_save=$FCFLAGS + # Flag ordering is significant for gfortran and Sun. + for ac_flag in -M -I '-I ' '-M ' -p '-mod ' '-module ' '-Am -I'; do + # Add the flag twice to prevent matching an output flag. + FCFLAGS="$ac_fc_module_flag_FCFLAGS_save ${ac_flag}conftest.dir ${ac_flag}conftest.dir" + cat > conftest.$ac_ext <<_ACEOF + + program main + use conftest_module + call conftest_routine + end program +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_cv_fc_module_flag="$ac_flag" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if test "$ac_cv_fc_module_flag" != unknown; then + break + fi + done + FCFLAGS=$ac_fc_module_flag_FCFLAGS_save + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +rm -rf conftest.dir +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_module_flag" >&5 +$as_echo "$ac_cv_fc_module_flag" >&6; } +if test "$ac_cv_fc_module_flag" != unknown; then + FC_MODINC=$ac_cv_fc_module_flag + +else + FC_MODINC= + as_fn_error $? 
"unable to find compiler flag for module search path" "$LINENO" 5 +fi + +# Ensure trailing whitespace is preserved in a Makefile. +ac_empty="" + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking Fortran 90 module output flag" >&5 +$as_echo_n "checking Fortran 90 module output flag... " >&6; } +if ${ac_cv_fc_module_output_flag+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + +mkdir conftest.dir conftest.dir/sub +cd conftest.dir +ac_cv_fc_module_output_flag=unknown +ac_fc_module_output_flag_FCFLAGS_save=$FCFLAGS +# Flag ordering is significant: put flags late which some compilers use +# for the search path. +for ac_flag in -J '-J ' -fmod= -moddir= +moddir= -qmoddir= '-mod ' \ + '-module ' -M '-Am -M' '-e m -J '; do + FCFLAGS="$ac_fc_module_output_flag_FCFLAGS_save ${ac_flag}sub" + cat > conftest.$ac_ext <<_ACEOF + + module conftest_module + contains + subroutine conftest_routine + write(*,'(a)') 'gotcha!' + end subroutine + end module +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + cd sub + cat > conftest.$ac_ext <<_ACEOF + + program main + use conftest_module + call conftest_routine + end program +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_cv_fc_module_output_flag="$ac_flag" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + cd .. + if test "$ac_cv_fc_module_output_flag" != unknown; then + break + fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +FCFLAGS=$ac_fc_module_output_flag_FCFLAGS_save +cd .. 
+rm -rf conftest.dir +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_module_output_flag" >&5 +$as_echo "$ac_cv_fc_module_output_flag" >&6; } +if test "$ac_cv_fc_module_output_flag" != unknown; then + FC_MODOUT=$ac_cv_fc_module_output_flag + +else + FC_MODOUT= + as_fn_error $? "unable to find compiler flag to write module information to" "$LINENO" 5 +fi + +# Ensure trailing whitespace is preserved in a Makefile. +ac_empty="" + + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $FC" >&5 +$as_echo_n "checking how to get verbose linking output from $FC... " >&6; } +if ${ac_cv_prog_fc_v+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + ac_cv_prog_fc_v= +# Try some options frequently used verbose output +for ac_verb in -v -verbose --verbose -V -\#\#\#; do + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FCFLAGS=$FCFLAGS +FCFLAGS="$FCFLAGS $ac_verb" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. 
+ac_fc_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_fc_v_output" >&5 +FCFLAGS=$ac_save_FCFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_fc_v_output="`echo $ac_fc_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_fc_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. +case $ac_fc_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_fc_v_output=`echo $ac_fc_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_fc_v_output=`echo $ac_fc_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_fc_v_output=`echo "$ac_fc_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"//g'` ;; +esac + + + # look for -l* and *.a constructs in the output + for ac_arg in $ac_fc_v_output; do + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a | -[lLRu]*) + ac_cv_prog_fc_v=$ac_verb + break 2 ;; + esac + done +done +if test -z "$ac_cv_prog_fc_v"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $FC" >&5 +$as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $FC" >&2;} +fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5 +$as_echo "$as_me: WARNING: compilation failed" >&2;} +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_v" >&5 +$as_echo "$ac_cv_prog_fc_v" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran libraries of $FC" >&5 +$as_echo_n "checking for Fortran libraries of $FC... " >&6; } +if ${ac_cv_fc_libs+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$FCLIBS" != "x"; then + ac_cv_fc_libs="$FCLIBS" # Let the user override the test. +else + +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FCFLAGS=$FCFLAGS +FCFLAGS="$FCFLAGS $ac_cv_prog_fc_v" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. 
+ac_fc_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_fc_v_output" >&5 +FCFLAGS=$ac_save_FCFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_fc_v_output="`echo $ac_fc_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_fc_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. +case $ac_fc_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_fc_v_output=`echo $ac_fc_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_fc_v_output=`echo $ac_fc_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_fc_v_output=`echo "$ac_fc_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"//g'` ;; +esac + + + +ac_cv_fc_libs= + +# Save positional arguments (if any) +ac_save_positional="$@" + +set X $ac_fc_v_output +while test $# != 1; do + shift + ac_arg=$1 + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a) + ac_exists=false + for ac_i in $ac_cv_fc_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" +fi + ;; + -bI:*) + ac_exists=false + for ac_i in $ac_cv_fc_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_arg; do + ac_cv_fc_libs="$ac_cv_fc_libs -Xlinker $ac_link_opt" + done +else + ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" +fi +fi + ;; + # Ignore these flags. + -lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \ + |-LANG:=* | -LIST:* | -LNO:* | -link) + ;; + -lkernel32) + case $host_os in + *cygwin*) ;; + *) ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" + ;; + esac + ;; + -[LRuYz]) + # These flags, when seen by themselves, take an argument. 
+ # We remove the space between option and argument and re-iterate + # unless we find an empty arg or a new option (starting with -) + case $2 in + "" | -*);; + *) + ac_arg="$ac_arg$2" + shift; shift + set X $ac_arg "$@" + ;; + esac + ;; + -YP,*) + for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do + ac_exists=false + for ac_i in $ac_cv_fc_libs; do + if test x"$ac_j" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_arg="$ac_arg $ac_j" + ac_cv_fc_libs="$ac_cv_fc_libs $ac_j" +fi + done + ;; + -[lLR]*) + ac_exists=false + for ac_i in $ac_cv_fc_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" +fi + ;; + -zallextract*| -zdefaultextract) + ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" + ;; + # Ignore everything else. + esac +done +# restore positional arguments +set X $ac_save_positional; shift + +# We only consider "LD_RUN_PATH" on Solaris systems. If this is seen, +# then we insist that the "run path" must be an absolute path (i.e. it +# must begin with a "/"). 
+case `(uname -sr) 2>/dev/null` in + "SunOS 5"*) + ac_ld_run_path=`$as_echo "$ac_fc_v_output" | + sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'` + test "x$ac_ld_run_path" != x && + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_ld_run_path; do + ac_cv_fc_libs="$ac_cv_fc_libs -Xlinker $ac_link_opt" + done +else + ac_cv_fc_libs="$ac_cv_fc_libs $ac_ld_run_path" +fi + ;; +esac +fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x" + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_libs" >&5 +$as_echo "$ac_cv_fc_libs" >&6; } +FCLIBS="$ac_cv_fc_libs" + + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + + +if test x"${enable_openmp}" = x"yes"; then + + OPENMP_FCFLAGS= + enable_openmp="yes" + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for fc option to support OpenMP" >&5 +$as_echo_n "checking for fc option to support OpenMP... 
" >&6; } +if ${ac_cv_prog_fc_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + + program test_openmp + use omp_lib + implicit none +!$ integer :: foobar + foobar = omp_get_num_threads() + end program + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_prog_fc_openmp='none needed' +else + ac_cv_prog_fc_openmp='unsupported' + for ac_option in -fopenmp -qopenmp -xopenmp -mp -omp -qsmp=omp -openmp; do + ac_save_FCFLAGS=$FCFLAGS + FCFLAGS="$FCFLAGS $ac_option" + cat > conftest.$ac_ext <<_ACEOF + + program test_openmp + use omp_lib + implicit none +!$ integer :: foobar + foobar = omp_get_num_threads() + end program + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_prog_fc_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + FCFLAGS=$ac_save_FCFLAGS + if test "$ac_cv_prog_fc_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5 +$as_echo "$ac_cv_prog_fc_openmp" >&6; } + case $ac_cv_prog_fc_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;; + esac + fi + + + if test "$ac_cv_prog_fc_openmp" = unsupported; then + as_fn_error $? "Could not compile a Fortran program with OpenMP, adjust FCFLAGS" "$LINENO" 5 + fi + FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS" +fi + +if test x"$with_mpi" = x"yes"; then + for ac_prog in mpiexec.hydra mpiexec mpirun poe runjob srun aprun +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_MPI_BINARY+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$MPI_BINARY"; then + ac_cv_prog_MPI_BINARY="$MPI_BINARY" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_MPI_BINARY="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +MPI_BINARY=$ac_cv_prog_MPI_BINARY +if test -n "$MPI_BINARY"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MPI_BINARY" >&5 +$as_echo "$MPI_BINARY" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$MPI_BINARY" && break +done +test -n "$MPI_BINARY" || MPI_BINARY="no" + + if test x"$MPI_BINARY" = x"no"; then + as_fn_error $? "Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun, aprun" "$LINENO" 5 + fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether stdout/stderr file redirect should be enabled" >&5 +$as_echo_n "checking whether stdout/stderr file redirect should be enabled... " >&6; } +# Check whether --enable-redirect was given. +if test "${enable_redirect+set}" = set; then : + enableval=$enable_redirect; + if test x"$enableval" = x"yes"; then + enable_redirect=yes + else + enable_redirect=no + fi + +else + enable_redirect=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_redirect}" >&5 +$as_echo "${enable_redirect}" >&6; } + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ELPA library should contain also single precision functions" >&5 +$as_echo_n "checking whether ELPA library should contain also single precision functions... " >&6; } +# Check whether --enable-single-precision was given. 
+if test "${enable_single_precision+set}" = set; then : + enableval=$enable_single_precision; want_single_precision="$enableval" +else + want_single_precision="no" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${want_single_precision}" >&5 +$as_echo "${want_single_precision}" >&6; } + + +if test x"${enable_redirect}" = x"yes"; then + +$as_echo "#define HAVE_REDIRECT 1" >>confdefs.h + +fi + if test x"$enable_redirect" = x"yes"; then + HAVE_REDIRECT_TRUE= + HAVE_REDIRECT_FALSE='#' +else + HAVE_REDIRECT_TRUE='#' + HAVE_REDIRECT_FALSE= +fi + + + +# Check whether --enable-timings was given. +if test "${enable_timings+set}" = set; then : + enableval=$enable_timings; + if test x"$enableval" = x"yes"; then + enable_timings=yes + else + enable_timings=no + fi + +else + enable_timings=yes +fi + + +if test x"${enable_timings}" = x"yes"; then + +$as_echo "#define HAVE_DETAILED_TIMINGS 1" >>confdefs.h + +fi + if test x"$enable_timings" = x"yes"; then + HAVE_DETAILED_TIMINGS_TRUE= + HAVE_DETAILED_TIMINGS_FALSE='#' +else + HAVE_DETAILED_TIMINGS_TRUE='#' + HAVE_DETAILED_TIMINGS_FALSE= +fi + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +# Check whether --with-papi was given. +if test "${with_papi+set}" = set; then : + withval=$with_papi; + if test x"$enableval" = x"yes"; then + with_papi=yes + else + with_papi=no + fi + +else + with_papi="no" +fi + +if test x"${enable_timings}" = x"yes"; then + if test x"$with_papi" = x"yes" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing PAPI_library_init" >&5 +$as_echo_n "checking for library containing PAPI_library_init... " >&6; } +if ${ac_cv_search_PAPI_library_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char PAPI_library_init (); +int +main () +{ +return PAPI_library_init (); + ; + return 0; +} +_ACEOF +for ac_lib in '' papi; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_c_try_link "$LINENO"; then : + ac_cv_search_PAPI_library_init=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_PAPI_library_init+:} false; then : + break +fi +done +if ${ac_cv_search_PAPI_library_init+:} false; then : + +else + ac_cv_search_PAPI_library_init=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_PAPI_library_init" >&5 +$as_echo "$ac_cv_search_PAPI_library_init" >&6; } +ac_res=$ac_cv_search_PAPI_library_init +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + papi_found=yes +else + papi_found=no +fi + + if test x"$papi_found" = x"no" ; then + as_fn_error $? "\"Could not find usable PAPI installation, please install or adjust CFLAGS, LDFLAGS\"" "$LINENO" 5 + fi + +$as_echo "#define HAVE_LIBPAPI 1" >>confdefs.h + + fi +fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +# Check whether --with-likwid was given. 
+if test "${with_likwid+set}" = set; then : + withval=$with_likwid; with_likwid="$withval" +else + with_likwid="no" +fi + + +if test x"$with_likwid" != x"no" ; then + if test -d $with_likwid/lib ; then + LDFLAGS="-L$with_likwid/lib $LDFLAGS" + fi + if test -d $with_likwid/lib64 ; then + LDFLAGS="-L$with_likwid/lib64 $LDFLAGS" + fi + if test -d $with_likwid/include ; then + FCFLAGS="-I$with_likwid/include $FCFLAGS" + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing likwid_markerInit" >&5 +$as_echo_n "checking for library containing likwid_markerInit... " >&6; } +if ${ac_cv_search_likwid_markerInit+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call likwid_markerInit + end +_ACEOF +for ac_lib in '' likwid; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_likwid_markerInit=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_likwid_markerInit+:} false; then : + break +fi +done +if ${ac_cv_search_likwid_markerInit+:} false; then : + +else + ac_cv_search_likwid_markerInit=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_likwid_markerInit" >&5 +$as_echo "$ac_cv_search_likwid_markerInit" >&6; } +ac_res=$ac_cv_search_likwid_markerInit +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + liblikwid_found="yes" +else + liblikwid_found="no" +fi + + if test x"$liblikwid_found" = x"no" ; then + as_fn_error $? 
"Could not find a usable likwid library, please adjust LDFLAGS" "$LINENO" 5 + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can use the likwid module in a Fortran program" >&5 +$as_echo_n "checking whether we can use the likwid module in a Fortran program... " >&6; } + cat > conftest.$ac_ext <<_ACEOF + + program foo + use likwid + + implicit none + + call likwid_markerInit() + call likwid_markerThreadInit() + + call likwid_markerStartRegion("foobar") + call likwid_markerStopRegion("foobar") + + call likwid_markerClose() + end + +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + as_fn_error $? "Could not compile a Fortran program using the likwid module, adjust FCFLAGS" "$LINENO" 5 +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +$as_echo "#define HAVE_LIKWID 1" >>confdefs.h + +fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + +save_FCFLAGS=$FCFLAGS +save_LDFLAGS=$LDFLAGS + + + + +FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS" +LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Fortran module iso_fortran_env is available" >&5 +$as_echo_n "checking whether Fortran module iso_fortran_env is available... 
" >&6; } +cat > conftest.$ac_ext <<_ACEOF + + program test_error_unit + use iso_fortran_env, only : error_unit + implicit none + + write(error_unit,*) "error_unit is defined" + end program + +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + can_use_iso_fortran_env=yes +else + can_use_iso_fortran_env=no + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_use_iso_fortran_env}" >&5 +$as_echo "${can_use_iso_fortran_env}" >&6; } +if test x"${can_use_iso_fortran_env}" = x"yes" ; then + +$as_echo "#define HAVE_ISO_FORTRAN_ENV 1" >>confdefs.h + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile a Fortran program using MKL" >&5 +$as_echo_n "checking whether we can compile a Fortran program using MKL... " >&6; } +cat > conftest.$ac_ext <<_ACEOF + + program test_mkl + use mkl_service + character*198 :: string + call mkl_get_version_string(string) + write(*,'(a)') string + end program + +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + can_compile_with_mkl=yes +else + can_compile_with_mkl=no + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_with_mkl}" >&5 +$as_echo "${can_compile_with_mkl}" >&6; } + +if test x"$can_compile_with_mkl" = x"yes" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a Fortran program with MKL" >&5 +$as_echo_n "checking whether we can link a Fortran program with MKL... 
" >&6; } + cat > conftest.$ac_ext <<_ACEOF + + program test_mkl + use mkl_service + character*198 :: string + call mkl_get_version_string(string) + write(*,'(a)') string + end program + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + have_mkl=yes +else + have_mkl=no + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_mkl}" >&5 +$as_echo "${have_mkl}" >&6; } +fi + +if test x"${have_mkl}" = x"yes" ; then + WITH_MKL=1 +else + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dgemm" >&5 +$as_echo_n "checking for library containing dgemm... " >&6; } +if ${ac_cv_search_dgemm+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call dgemm + end +_ACEOF +for ac_lib in '' openblas satlas blas; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_dgemm=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_dgemm+:} false; then : + break +fi +done +if ${ac_cv_search_dgemm+:} false; then : + +else + ac_cv_search_dgemm=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dgemm" >&5 +$as_echo "$ac_cv_search_dgemm" >&6; } +ac_res=$ac_cv_search_dgemm +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_blas=yes +else + have_blas=no +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a program with a blas lib" >&5 +$as_echo_n "checking whether we can link a program with a blas lib... " >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_blas}" >&5 +$as_echo "${have_blas}" >&6; } + + if test x"${have_blas}" = x"no" ; then + as_fn_error $? 
"could not link with blas: specify path" "$LINENO" 5 + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dlarrv" >&5 +$as_echo_n "checking for library containing dlarrv... " >&6; } +if ${ac_cv_search_dlarrv+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call dlarrv + end +_ACEOF +for ac_lib in '' lapack; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_dlarrv=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_dlarrv+:} false; then : + break +fi +done +if ${ac_cv_search_dlarrv+:} false; then : + +else + ac_cv_search_dlarrv=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dlarrv" >&5 +$as_echo "$ac_cv_search_dlarrv" >&6; } +ac_res=$ac_cv_search_dlarrv +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_lapack=yes +else + have_lapack=no +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a program with a lapack lib" >&5 +$as_echo_n "checking whether we can link a program with a lapack lib... " >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_lapack}" >&5 +$as_echo "${have_lapack}" >&6; } + + if test x"${have_lapack}" = x"no" ; then + as_fn_error $? "could not link with lapack: specify path" "$LINENO" 5 + fi + + if test x"${with_mpi}" = x"yes"; then + scalapack_libs="mpiscalapack scalapack scalapack-openmpi" + old_LIBS="$LIBS" + for lib in ${scalapack_libs}; do + LIBS="-l${lib} ${old_LIBS}" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -l${lib} already contains a BLACS implementation" >&5 +$as_echo_n "checking whether -l${lib} already contains a BLACS implementation... 
" >&6; } + cat > conftest.$ac_ext <<_ACEOF + program main + call blacs_gridinit + end +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + blacs_in_scalapack=yes +else + blacs_in_scalapack=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${blacs_in_scalapack}" >&5 +$as_echo "${blacs_in_scalapack}" >&6; } + if test x"${blacs_in_scalapack}" = x"yes"; then + break + fi + done + + if test x"${blacs_in_scalapack}" = x"no"; then + LIBS="${old_LIBS}" + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing bi_f77_init" >&5 +$as_echo_n "checking for library containing bi_f77_init... " >&6; } +if ${ac_cv_search_bi_f77_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call bi_f77_init + end +_ACEOF +for ac_lib in '' mpiblacsF77init; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib -lmpiblacs $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_bi_f77_init=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_bi_f77_init+:} false; then : + break +fi +done +if ${ac_cv_search_bi_f77_init+:} false; then : + +else + ac_cv_search_bi_f77_init=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_bi_f77_init" >&5 +$as_echo "$ac_cv_search_bi_f77_init" >&6; } +ac_res=$ac_cv_search_bi_f77_init +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing blacs_gridinit" >&5 +$as_echo_n "checking for library containing blacs_gridinit... 
" >&6; } +if ${ac_cv_search_blacs_gridinit+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call blacs_gridinit + end +_ACEOF +for ac_lib in '' blacs-openmpi; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib -lblacsCinit-openmpi -lscalapack-openmpi $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_blacs_gridinit=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_blacs_gridinit+:} false; then : + break +fi +done +if ${ac_cv_search_blacs_gridinit+:} false; then : + +else + ac_cv_search_blacs_gridinit=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_blacs_gridinit" >&5 +$as_echo "$ac_cv_search_blacs_gridinit" >&6; } +ac_res=$ac_cv_search_blacs_gridinit +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_blacs=yes +else + have_blacs=no +fi + + if test x"${have_blacs}" = x"no"; then + unset ac_cv_search_blacs_gridinit + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing blacs_gridinit" >&5 +$as_echo_n "checking for library containing blacs_gridinit... 
" >&6; } +if ${ac_cv_search_blacs_gridinit+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call blacs_gridinit + end +_ACEOF +for ac_lib in '' mpiblacs blacs; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_blacs_gridinit=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_blacs_gridinit+:} false; then : + break +fi +done +if ${ac_cv_search_blacs_gridinit+:} false; then : + +else + ac_cv_search_blacs_gridinit=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_blacs_gridinit" >&5 +$as_echo "$ac_cv_search_blacs_gridinit" >&6; } +ac_res=$ac_cv_search_blacs_gridinit +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_blacs=yes +else + have_blacs=no +fi + + + if test x"${have_blacs}" = x"no"; then + as_fn_error $? "No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure" "$LINENO" 5 + fi + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing pdtran" >&5 +$as_echo_n "checking for library containing pdtran... 
" >&6; } +if ${ac_cv_search_pdtran+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat > conftest.$ac_ext <<_ACEOF + program main + call pdtran + end +_ACEOF +for ac_lib in '' $scalapack_libs; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_search_pdtran=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_pdtran+:} false; then : + break +fi +done +if ${ac_cv_search_pdtran+:} false; then : + +else + ac_cv_search_pdtran=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_pdtran" >&5 +$as_echo "$ac_cv_search_pdtran" >&6; } +ac_res=$ac_cv_search_pdtran +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_scalapack=yes +else + have_scalapack=no +fi + + + if test x"${have_scalapack}" = x"no" ; then + as_fn_error $? "could not link with scalapack: specify path" "$LINENO" 5 + fi + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a Fortran program with all blacs/scalapack" >&5 +$as_echo_n "checking whether we can link a Fortran program with all blacs/scalapack... 
" >&6; } + cat > conftest.$ac_ext <<_ACEOF + + program dgemm_test + + integer , parameter:: M = 4, N = 3, K = 2 + real :: A(M,K), B(K,N), C(M,N) + + call dgemm('N','N',M,N,K,1.0,A,M,B,K,0.0,C,M) + + end program dgemm_test + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + can_link_with_blacs_scalapack=yes +else + can_link_with_blacs_scalapack=no + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_link_with_blacs_scalapack}" >&5 +$as_echo "${can_link_with_blacs_scalapack}" >&6; } + + if test x"${can_link_with_blacs_scalapack}" = x"yes" ; then + WITH_BLACS=1 + else + as_fn_error $? "We can neither link with MKL or another Scalpack. Please specify SCALAPACK_LDFLAGS and SCALAPACK_FCFLAGS!" "$LINENO" 5 + fi +fi + +FCFLAGS=$save_FCFLAGS +LDFLAGS=$save_LDFLAGS + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can use the intrinsic Fortran function \"get_environment_variable\"" >&5 +$as_echo_n "checking whether we can use the intrinsic Fortran function \"get_environment_variable\"... 
" >&6; } + + +cat > conftest.$ac_ext <<_ACEOF + + program test_get_environment + character(len=256) :: homedir + call get_environment_variable("HOME",homedir) + end program + +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + fortran_can_check_environment=yes +else + fortran_can_check_environment=no + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${fortran_can_check_environment}" >&5 +$as_echo "${fortran_can_check_environment}" >&6; } +if test x"${fortran_can_check_environment}" = x"yes" ; then + +$as_echo "#define HAVE_ENVIRONMENT_CHECKING 1" >>confdefs.h + fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -done -rm -f conftest.err conftest.$ac_objext conftest.$ac_ext -FCFLAGS=$ac_fc_freeform_FCFLAGS_save + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BAND_TO_FLULL_BLOCKING is requested" >&5 +$as_echo_n "checking whether BAND_TO_FLULL_BLOCKING is requested... " >&6; } +# Check whether --enable-band-to-full-blocking was given. 
+if test "${enable_band_to_full_blocking+set}" = set; then : + enableval=$enable_band_to_full_blocking; + if test x"$enableval" = x"yes"; then + enable_band_to_full_blocking=yes + else + enable_band_to_full_blocking=no + fi + +else + enable_band_to_full_blocking="yes" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_freeform" >&5 -$as_echo "$ac_cv_fc_freeform" >&6; } -if test "x$ac_cv_fc_freeform" = xunknown; then - as_fn_error 77 "Fortran does not accept free-form source" "$LINENO" 5 + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_band_to_full_blocking}" >&5 +$as_echo "${enable_band_to_full_blocking}" >&6; } + + if test x"$enable_band_to_full_blocking" = x"yes"; then + BAND_TO_FULL_BLOCKING_TRUE= + BAND_TO_FULL_BLOCKING_FALSE='#' else - if test "x$ac_cv_fc_freeform" != xnone; then - FCFLAGS="$FCFLAGS $ac_cv_fc_freeform" - fi + BAND_TO_FULL_BLOCKING_TRUE='#' + BAND_TO_FULL_BLOCKING_FALSE= +fi + +if test x"${enable_band_to_full_blocking}" = x"yes"; then + +$as_echo "#define BAND_TO_FULL_BLOCKING 1" >>confdefs.h fi -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking Fortran 90 module inclusion flag" >&5 -$as_echo_n "checking Fortran 90 module inclusion flag... " >&6; } -if ${ac_cv_fc_module_flag+:} false; then : - $as_echo_n "(cached) " >&6 + +# Check whether --with-cuda-path was given. 
+if test "${with_cuda_path+set}" = set; then : + withval=$with_cuda_path; CUDA_INSTALL_PATH=$withval else - ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + with_cuda=auto +fi -ac_cv_fc_module_flag=unknown -mkdir conftest.dir -cd conftest.dir -cat > conftest.$ac_ext <<_ACEOF - module conftest_module - contains - subroutine conftest_routine - write(*,'(a)') 'gotcha!' - end subroutine - end module -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - cd .. - ac_fc_module_flag_FCFLAGS_save=$FCFLAGS - # Flag ordering is significant for gfortran and Sun. - for ac_flag in -M -I '-I ' '-M ' -p '-mod ' '-module ' '-Am -I'; do - # Add the flag twice to prevent matching an output flag. - FCFLAGS="$ac_fc_module_flag_FCFLAGS_save ${ac_flag}conftest.dir ${ac_flag}conftest.dir" - cat > conftest.$ac_ext <<_ACEOF - program main - use conftest_module - call conftest_routine +# Check whether --with-cuda-sdk-path was given. +if test "${with_cuda_sdk_path+set}" = set; then : + withval=$with_cuda_sdk_path; CUDA_SDK_INSTALL_PATH=$withval +else + with_cuda_sdk=auto +fi + + + +user_sets_gpu_compute_capability="no" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a GPU compute capability is specified" >&5 +$as_echo_n "checking whether a GPU compute capability is specified... " >&6; } + +# Check whether --with-GPU-compute-capability was given. 
+if test "${with_GPU_compute_capability+set}" = set; then : + withval=$with_GPU_compute_capability; user_sets_gpu_compute_capability="yes" +else + cuda_compute_capability="sm_35" +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${user_sets_gpu_compute_capability}" >&5 +$as_echo "${user_sets_gpu_compute_capability}" >&6; } + + +if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then + value=$(echo $withval | cut -c1-3) + if test x"${value}" = x"sm_" ; then + cuda_compute_capability=$withval + else + as_fn_error $? "Unknown GPU compute capability set: ${withval}" "$LINENO" 5 + fi +fi + + +if test x"${with_mpi}" = x"yes" ; then + # Check whether --enable-mpi-module was given. +if test "${enable_mpi_module+set}" = set; then : + enableval=$enable_mpi_module; + if test x"$enableval" = x"yes"; then + enable_mpi_module=yes + else + enable_mpi_module=no + fi + +else + enable_mpi_module=yes +fi + + if test x"${enable_mpi_module}" = x"yes" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Fortran mpi module can be used" >&5 +$as_echo_n "checking whether Fortran mpi module can be used... " >&6; } + cat > conftest.$ac_ext <<_ACEOF + + program test_mpi_module + use mpi + real :: time + time = MPI_WTime() end program + _ACEOF if ac_fn_fc_try_compile "$LINENO"; then : - ac_cv_fc_module_flag="$ac_flag" + can_use_fortran_mpi_module=yes +else + can_use_fortran_mpi_module=no + fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - if test "$ac_cv_fc_module_flag" != unknown; then - break - fi - done - FCFLAGS=$ac_fc_module_flag_FCFLAGS_save + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_use_fortran_mpi_module}" >&5 +$as_echo "${can_use_fortran_mpi_module}" >&6; } + if test x"${can_use_fortran_mpi_module}" = x"yes" ; then + +$as_echo "#define HAVE_MPI_MODULE 1" >>confdefs.h + + else + as_fn_error $? "Could not compile a Fortran program with an 'use mpi' statement. 
You can try again with --disable-mpi-module" "$LINENO" 5 + fi + fi +fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + + + + + + + + + + + + + + + + + + + + + + + +#m4_define(elpa_m4_gpu_kernels, [ +# real_gpu +# complex_gpu +#]) + + + + + + + + + + + + + # Check whether --enable-generic was given. +if test "${enable_generic+set}" = set; then : + enableval=$enable_generic; +else + enable_generic=yes +fi + + + use_real_generic=$enable_generic + + use_real_generic_simple=$enable_generic + + use_real_generic_simple_block4=$enable_generic + + use_real_generic_simple_block6=$enable_generic + + use_complex_generic=$enable_generic + + use_complex_generic_simple=$enable_generic + + + + # Check whether --enable-sparc64 was given. +if test "${enable_sparc64+set}" = set; then : + enableval=$enable_sparc64; +else + enable_sparc64=no +fi + + + use_real_sparc64_block2=$enable_sparc64 + + use_real_sparc64_block4=$enable_sparc64 + + use_real_sparc64_block6=$enable_sparc64 + + + + # Check whether --enable-neon-arch64 was given. +if test "${enable_neon_arch64+set}" = set; then : + enableval=$enable_neon_arch64; +else + enable_neon_arch64=no +fi + + + use_real_neon_arch64_block2=$enable_neon_arch64 + + use_real_neon_arch64_block4=$enable_neon_arch64 + + use_real_neon_arch64_block6=$enable_neon_arch64 + + + + # Check whether --enable-vsx was given. +if test "${enable_vsx+set}" = set; then : + enableval=$enable_vsx; +else + enable_vsx=no +fi + + + use_real_vsx_block2=$enable_vsx + + use_real_vsx_block4=$enable_vsx + + use_real_vsx_block6=$enable_vsx + + + + # Check whether --enable-sse was given. 
+if test "${enable_sse+set}" = set; then : + enableval=$enable_sse; +else + enable_sse=yes +fi + + + use_real_sse_block2=$enable_sse + + use_real_sse_block4=$enable_sse + + use_real_sse_block6=$enable_sse + + use_complex_sse_block1=$enable_sse + + use_complex_sse_block2=$enable_sse + + + + # Check whether --enable-sse-assembly was given. +if test "${enable_sse_assembly+set}" = set; then : + enableval=$enable_sse_assembly; +else + enable_sse_assembly=yes +fi + + + use_real_sse_assembly=$enable_sse_assembly + + use_complex_sse_assembly=$enable_sse_assembly + + + + # Check whether --enable-avx was given. +if test "${enable_avx+set}" = set; then : + enableval=$enable_avx; +else + enable_avx=yes +fi + + + use_real_avx_block2=$enable_avx + + use_real_avx_block4=$enable_avx + + use_real_avx_block6=$enable_avx + + use_complex_avx_block1=$enable_avx + + use_complex_avx_block2=$enable_avx + + + + # Check whether --enable-avx2 was given. +if test "${enable_avx2+set}" = set; then : + enableval=$enable_avx2; +else + enable_avx2=yes +fi + + + use_real_avx2_block2=$enable_avx2 + + use_real_avx2_block4=$enable_avx2 + + use_real_avx2_block6=$enable_avx2 + + use_complex_avx2_block1=$enable_avx2 + + use_complex_avx2_block2=$enable_avx2 + + + + # Check whether --enable-avx512 was given. +if test "${enable_avx512+set}" = set; then : + enableval=$enable_avx512; +else + enable_avx512=yes +fi + + + use_real_avx512_block2=$enable_avx512 + + use_real_avx512_block4=$enable_avx512 + + use_real_avx512_block6=$enable_avx512 + + use_complex_avx512_block1=$enable_avx512 + + use_complex_avx512_block2=$enable_avx512 + + +#ELPA_SELECT_KERNELS([gpu],[disable]) + + # Check whether --enable-bgp was given. +if test "${enable_bgp+set}" = set; then : + enableval=$enable_bgp; +else + enable_bgp=no +fi + + + use_real_bgp=$enable_bgp + + use_complex_bgp=$enable_bgp + + + + # Check whether --enable-bgq was given. 
+if test "${enable_bgq+set}" = set; then : + enableval=$enable_bgq; +else + enable_bgq=no +fi + + + use_real_bgq=$enable_bgq + + use_complex_bgq=$enable_bgq + + + + + default_real_kernel="" + + default_complex_kernel="" + + +if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then + + if x"$use_real_sparc64_block2" = x"yes" ; then + echo "Disabling real_sparc64_block2 due to BGP/BGQ option" + fi + use_real_sparc64_block2=no + + if x"$use_real_sparc64_block4" = x"yes" ; then + echo "Disabling real_sparc64_block4 due to BGP/BGQ option" + fi + use_real_sparc64_block4=no + + if x"$use_real_sparc64_block6" = x"yes" ; then + echo "Disabling real_sparc64_block6 due to BGP/BGQ option" + fi + use_real_sparc64_block6=no + + if x"$use_real_neon_arch64_block2" = x"yes" ; then + echo "Disabling real_neon_arch64_block2 due to BGP/BGQ option" + fi + use_real_neon_arch64_block2=no + + if x"$use_real_neon_arch64_block4" = x"yes" ; then + echo "Disabling real_neon_arch64_block4 due to BGP/BGQ option" + fi + use_real_neon_arch64_block4=no + + if x"$use_real_neon_arch64_block6" = x"yes" ; then + echo "Disabling real_neon_arch64_block6 due to BGP/BGQ option" + fi + use_real_neon_arch64_block6=no + + if x"$use_real_vsx_block2" = x"yes" ; then + echo "Disabling real_vsx_block2 due to BGP/BGQ option" + fi + use_real_vsx_block2=no + + if x"$use_real_vsx_block4" = x"yes" ; then + echo "Disabling real_vsx_block4 due to BGP/BGQ option" + fi + use_real_vsx_block4=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -rm -rf conftest.dir -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + if x"$use_real_vsx_block6" = x"yes" ; then + echo "Disabling real_vsx_block6 due to BGP/BGQ option" + fi + use_real_vsx_block6=no + if x"$use_real_sse_block2" = x"yes" ; then + echo "Disabling 
real_sse_block2 due to BGP/BGQ option" + fi + use_real_sse_block2=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_module_flag" >&5 -$as_echo "$ac_cv_fc_module_flag" >&6; } -if test "$ac_cv_fc_module_flag" != unknown; then - FC_MODINC=$ac_cv_fc_module_flag + if x"$use_real_sse_block4" = x"yes" ; then + echo "Disabling real_sse_block4 due to BGP/BGQ option" + fi + use_real_sse_block4=no -else - FC_MODINC= - as_fn_error $? "unable to find compiler flag for module search path" "$LINENO" 5 -fi + if x"$use_real_sse_block6" = x"yes" ; then + echo "Disabling real_sse_block6 due to BGP/BGQ option" + fi + use_real_sse_block6=no -# Ensure trailing whitespace is preserved in a Makefile. -ac_empty="" + if x"$use_complex_sse_block1" = x"yes" ; then + echo "Disabling complex_sse_block1 due to BGP/BGQ option" + fi + use_complex_sse_block1=no + if x"$use_complex_sse_block2" = x"yes" ; then + echo "Disabling complex_sse_block2 due to BGP/BGQ option" + fi + use_complex_sse_block2=no + if x"$use_real_avx_block2" = x"yes" ; then + echo "Disabling real_avx_block2 due to BGP/BGQ option" + fi + use_real_avx_block2=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking Fortran 90 module output flag" >&5 -$as_echo_n "checking Fortran 90 module output flag... " >&6; } -if ${ac_cv_fc_module_output_flag+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + if x"$use_real_avx_block4" = x"yes" ; then + echo "Disabling real_avx_block4 due to BGP/BGQ option" + fi + use_real_avx_block4=no -mkdir conftest.dir conftest.dir/sub -cd conftest.dir -ac_cv_fc_module_output_flag=unknown -ac_fc_module_output_flag_FCFLAGS_save=$FCFLAGS -# Flag ordering is significant: put flags late which some compilers use -# for the search path. 
-for ac_flag in -J '-J ' -fmod= -moddir= +moddir= -qmoddir= '-mod ' \ - '-module ' -M '-Am -M' '-e m -J '; do - FCFLAGS="$ac_fc_module_output_flag_FCFLAGS_save ${ac_flag}sub" - cat > conftest.$ac_ext <<_ACEOF + if x"$use_real_avx_block6" = x"yes" ; then + echo "Disabling real_avx_block6 due to BGP/BGQ option" + fi + use_real_avx_block6=no - module conftest_module - contains - subroutine conftest_routine - write(*,'(a)') 'gotcha!' - end subroutine - end module -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - cd sub - cat > conftest.$ac_ext <<_ACEOF + if x"$use_complex_avx_block1" = x"yes" ; then + echo "Disabling complex_avx_block1 due to BGP/BGQ option" + fi + use_complex_avx_block1=no - program main - use conftest_module - call conftest_routine - end program -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - ac_cv_fc_module_output_flag="$ac_flag" -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - cd .. - if test "$ac_cv_fc_module_output_flag" != unknown; then - break - fi -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -done -FCFLAGS=$ac_fc_module_output_flag_FCFLAGS_save -cd .. 
-rm -rf conftest.dir -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + if x"$use_complex_avx_block2" = x"yes" ; then + echo "Disabling complex_avx_block2 due to BGP/BGQ option" + fi + use_complex_avx_block2=no + if x"$use_real_avx2_block2" = x"yes" ; then + echo "Disabling real_avx2_block2 due to BGP/BGQ option" + fi + use_real_avx2_block2=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_module_output_flag" >&5 -$as_echo "$ac_cv_fc_module_output_flag" >&6; } -if test "$ac_cv_fc_module_output_flag" != unknown; then - FC_MODOUT=$ac_cv_fc_module_output_flag + if x"$use_real_avx2_block4" = x"yes" ; then + echo "Disabling real_avx2_block4 due to BGP/BGQ option" + fi + use_real_avx2_block4=no -else - FC_MODOUT= - as_fn_error $? "unable to find compiler flag to write module information to" "$LINENO" 5 -fi + if x"$use_real_avx2_block6" = x"yes" ; then + echo "Disabling real_avx2_block6 due to BGP/BGQ option" + fi + use_real_avx2_block6=no -# Ensure trailing whitespace is preserved in a Makefile. -ac_empty="" + if x"$use_complex_avx2_block1" = x"yes" ; then + echo "Disabling complex_avx2_block1 due to BGP/BGQ option" + fi + use_complex_avx2_block1=no + if x"$use_complex_avx2_block2" = x"yes" ; then + echo "Disabling complex_avx2_block2 due to BGP/BGQ option" + fi + use_complex_avx2_block2=no -# Make sure we can run config.sub. -$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || - as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 + if x"$use_real_avx512_block2" = x"yes" ; then + echo "Disabling real_avx512_block2 due to BGP/BGQ option" + fi + use_real_avx512_block2=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 -$as_echo_n "checking build system type... 
" >&6; } -if ${ac_cv_build+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_build_alias=$build_alias -test "x$ac_build_alias" = x && - ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` -test "x$ac_build_alias" = x && - as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 -ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + if x"$use_real_avx512_block4" = x"yes" ; then + echo "Disabling real_avx512_block4 due to BGP/BGQ option" + fi + use_real_avx512_block4=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 -$as_echo "$ac_cv_build" >&6; } -case $ac_cv_build in -*-*-*) ;; -*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; -esac -build=$ac_cv_build -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_build -shift -build_cpu=$1 -build_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -build_os=$* -IFS=$ac_save_IFS -case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + if x"$use_real_avx512_block6" = x"yes" ; then + echo "Disabling real_avx512_block6 due to BGP/BGQ option" + fi + use_real_avx512_block6=no + if x"$use_complex_avx512_block1" = x"yes" ; then + echo "Disabling complex_avx512_block1 due to BGP/BGQ option" + fi + use_complex_avx512_block1=no -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 -$as_echo_n "checking host system type... " >&6; } -if ${ac_cv_host+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "x$host_alias" = x; then - ac_cv_host=$ac_cv_build -else - ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || - as_fn_error $? 
"$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 -fi + if x"$use_complex_avx512_block2" = x"yes" ; then + echo "Disabling complex_avx512_block2 due to BGP/BGQ option" + fi + use_complex_avx512_block2=no fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 -$as_echo "$ac_cv_host" >&6; } -case $ac_cv_host in -*-*-*) ;; -*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;; -esac -host=$ac_cv_host -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_host -shift -host_cpu=$1 -host_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -host_os=$* -IFS=$ac_save_IFS -case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $FC" >&5 -$as_echo_n "checking how to get verbose linking output from $FC... " >&6; } -if ${ac_cv_prog_fc_v+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat > conftest.$ac_ext <<_ACEOF - program main - end -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - ac_cv_prog_fc_v= -# Try some options frequently used verbose output -for ac_verb in -v -verbose --verbose -V -\#\#\#; do - cat > conftest.$ac_ext <<_ACEOF - program main - end -_ACEOF +# Check whether --with-fixed-real-kernel was given. +if test "${with_fixed_real_kernel+set}" = set; then : + withval=$with_fixed_real_kernel; fixed_real_kernel="real_$withval" +else + fixed_real_kernel="" +fi -# Compile and link our simple test program by passing a flag (argument -# 1 to this macro) to the Fortran compiler in order to get -# "verbose" output that we can then parse for the Fortran linker -# flags. 
-ac_save_FCFLAGS=$FCFLAGS -FCFLAGS="$FCFLAGS $ac_verb" -eval "set x $ac_link" -shift -$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 -# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, -# LIBRARY_PATH; skip all such settings. -ac_fc_v_output=`eval $ac_link 5>&1 2>&1 | - sed '/^Driving:/d; /^Configured with:/d; - '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` -$as_echo "$ac_fc_v_output" >&5 -FCFLAGS=$ac_save_FCFLAGS + if test -n "$fixed_real_kernel" ; then + found="no" -rm -rf conftest* + if test "$fixed_real_kernel" = "real_generic" ; then + use_real_generic=yes + found="yes" + else + use_real_generic=no + fi -# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where -# /foo, /bar, and /baz are search directories for the Fortran linker. -# Here, we change these into -L/foo -L/bar -L/baz (and put it first): -ac_fc_v_output="`echo $ac_fc_v_output | - grep 'LPATH is:' | - sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_fc_v_output" + if test "$fixed_real_kernel" = "real_generic_simple" ; then + use_real_generic_simple=yes + found="yes" + else + use_real_generic_simple=no + fi -# FIXME: we keep getting bitten by quoted arguments; a more general fix -# that detects unbalanced quotes in FLIBS should be implemented -# and (ugh) tested at some point. -case $ac_fc_v_output in - # With xlf replace commas with spaces, - # and remove "-link" and closing parenthesis. - *xlfentry*) - ac_fc_v_output=`echo $ac_fc_v_output | - sed ' - s/,/ /g - s/ -link / /g - s/) *$// - ' - ` ;; + if test "$fixed_real_kernel" = "real_generic_simple_block4" ; then + use_real_generic_simple_block4=yes + found="yes" + else + use_real_generic_simple_block4=no + fi - # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted - # $LIBS confuse us, and the libraries appear later in the output anyway). 
- *mGLOB_options_string*) - ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + if test "$fixed_real_kernel" = "real_generic_simple_block6" ; then + use_real_generic_simple_block6=yes + found="yes" + else + use_real_generic_simple_block6=no + fi - # Portland Group compiler has singly- or doubly-quoted -cmdline argument - # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. - # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". - *-cmdline\ * | *-ignore\ * | *-def\ *) - ac_fc_v_output=`echo $ac_fc_v_output | sed "\ - s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g - s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g - s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + if test "$fixed_real_kernel" = "real_sparc64_block2" ; then + use_real_sparc64_block2=yes + found="yes" + else + use_real_sparc64_block2=no + fi - # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. - *fort77*f2c*gcc*) - ac_fc_v_output=`echo "$ac_fc_v_output" | sed -n ' - /:[ ]\+Running[ ]\{1,\}"gcc"/{ - /"-c"/d - /[.]c"*/d - s/^.*"gcc"/"gcc"/ - s/"//gp - }'` ;; + if test "$fixed_real_kernel" = "real_sparc64_block4" ; then + use_real_sparc64_block4=yes + found="yes" + else + use_real_sparc64_block4=no + fi - # If we are using Cray Fortran then delete quotes. 
- *cft90*) - ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"//g'` ;; -esac + if test "$fixed_real_kernel" = "real_sparc64_block6" ; then + use_real_sparc64_block6=yes + found="yes" + else + use_real_sparc64_block6=no + fi + if test "$fixed_real_kernel" = "real_neon_arch64_block2" ; then + use_real_neon_arch64_block2=yes + found="yes" + else + use_real_neon_arch64_block2=no + fi - # look for -l* and *.a constructs in the output - for ac_arg in $ac_fc_v_output; do - case $ac_arg in - [\\/]*.a | ?:[\\/]*.a | -[lLRu]*) - ac_cv_prog_fc_v=$ac_verb - break 2 ;; - esac - done -done -if test -z "$ac_cv_prog_fc_v"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $FC" >&5 -$as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $FC" >&2;} -fi -else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5 -$as_echo "$as_me: WARNING: compilation failed" >&2;} -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if test "$fixed_real_kernel" = "real_neon_arch64_block4" ; then + use_real_neon_arch64_block4=yes + found="yes" + else + use_real_neon_arch64_block4=no + fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_v" >&5 -$as_echo "$ac_cv_prog_fc_v" >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran libraries of $FC" >&5 -$as_echo_n "checking for Fortran libraries of $FC... " >&6; } -if ${ac_cv_fc_libs+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "x$FCLIBS" != "x"; then - ac_cv_fc_libs="$FCLIBS" # Let the user override the test. 
-else + if test "$fixed_real_kernel" = "real_neon_arch64_block6" ; then + use_real_neon_arch64_block6=yes + found="yes" + else + use_real_neon_arch64_block6=no + fi -cat > conftest.$ac_ext <<_ACEOF - program main + if test "$fixed_real_kernel" = "real_vsx_block2" ; then + use_real_vsx_block2=yes + found="yes" + else + use_real_vsx_block2=no + fi - end -_ACEOF + if test "$fixed_real_kernel" = "real_vsx_block4" ; then + use_real_vsx_block4=yes + found="yes" + else + use_real_vsx_block4=no + fi -# Compile and link our simple test program by passing a flag (argument -# 1 to this macro) to the Fortran compiler in order to get -# "verbose" output that we can then parse for the Fortran linker -# flags. -ac_save_FCFLAGS=$FCFLAGS -FCFLAGS="$FCFLAGS $ac_cv_prog_fc_v" -eval "set x $ac_link" -shift -$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 -# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, -# LIBRARY_PATH; skip all such settings. -ac_fc_v_output=`eval $ac_link 5>&1 2>&1 | - sed '/^Driving:/d; /^Configured with:/d; - '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` -$as_echo "$ac_fc_v_output" >&5 -FCFLAGS=$ac_save_FCFLAGS + if test "$fixed_real_kernel" = "real_vsx_block6" ; then + use_real_vsx_block6=yes + found="yes" + else + use_real_vsx_block6=no + fi -rm -rf conftest* + if test "$fixed_real_kernel" = "real_sse_block2" ; then + use_real_sse_block2=yes + found="yes" + else + use_real_sse_block2=no + fi -# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where -# /foo, /bar, and /baz are search directories for the Fortran linker. 
-# Here, we change these into -L/foo -L/bar -L/baz (and put it first): -ac_fc_v_output="`echo $ac_fc_v_output | - grep 'LPATH is:' | - sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_fc_v_output" + if test "$fixed_real_kernel" = "real_sse_block4" ; then + use_real_sse_block4=yes + found="yes" + else + use_real_sse_block4=no + fi -# FIXME: we keep getting bitten by quoted arguments; a more general fix -# that detects unbalanced quotes in FLIBS should be implemented -# and (ugh) tested at some point. -case $ac_fc_v_output in - # With xlf replace commas with spaces, - # and remove "-link" and closing parenthesis. - *xlfentry*) - ac_fc_v_output=`echo $ac_fc_v_output | - sed ' - s/,/ /g - s/ -link / /g - s/) *$// - ' - ` ;; + if test "$fixed_real_kernel" = "real_sse_block6" ; then + use_real_sse_block6=yes + found="yes" + else + use_real_sse_block6=no + fi - # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted - # $LIBS confuse us, and the libraries appear later in the output anyway). - *mGLOB_options_string*) - ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + if test "$fixed_real_kernel" = "real_sse_assembly" ; then + use_real_sse_assembly=yes + found="yes" + else + use_real_sse_assembly=no + fi - # Portland Group compiler has singly- or doubly-quoted -cmdline argument - # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. - # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". - *-cmdline\ * | *-ignore\ * | *-def\ *) - ac_fc_v_output=`echo $ac_fc_v_output | sed "\ - s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g - s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g - s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + if test "$fixed_real_kernel" = "real_avx_block2" ; then + use_real_avx_block2=yes + found="yes" + else + use_real_avx_block2=no + fi - # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. 
- *fort77*f2c*gcc*) - ac_fc_v_output=`echo "$ac_fc_v_output" | sed -n ' - /:[ ]\+Running[ ]\{1,\}"gcc"/{ - /"-c"/d - /[.]c"*/d - s/^.*"gcc"/"gcc"/ - s/"//gp - }'` ;; + if test "$fixed_real_kernel" = "real_avx_block4" ; then + use_real_avx_block4=yes + found="yes" + else + use_real_avx_block4=no + fi - # If we are using Cray Fortran then delete quotes. - *cft90*) - ac_fc_v_output=`echo $ac_fc_v_output | sed 's/"//g'` ;; -esac + if test "$fixed_real_kernel" = "real_avx_block6" ; then + use_real_avx_block6=yes + found="yes" + else + use_real_avx_block6=no + fi + if test "$fixed_real_kernel" = "real_avx2_block2" ; then + use_real_avx2_block2=yes + found="yes" + else + use_real_avx2_block2=no + fi + if test "$fixed_real_kernel" = "real_avx2_block4" ; then + use_real_avx2_block4=yes + found="yes" + else + use_real_avx2_block4=no + fi -ac_cv_fc_libs= + if test "$fixed_real_kernel" = "real_avx2_block6" ; then + use_real_avx2_block6=yes + found="yes" + else + use_real_avx2_block6=no + fi -# Save positional arguments (if any) -ac_save_positional="$@" + if test "$fixed_real_kernel" = "real_avx512_block2" ; then + use_real_avx512_block2=yes + found="yes" + else + use_real_avx512_block2=no + fi -set X $ac_fc_v_output -while test $# != 1; do - shift - ac_arg=$1 - case $ac_arg in - [\\/]*.a | ?:[\\/]*.a) - ac_exists=false - for ac_i in $ac_cv_fc_libs; do - if test x"$ac_arg" = x"$ac_i"; then - ac_exists=true - break - fi - done + if test "$fixed_real_kernel" = "real_avx512_block4" ; then + use_real_avx512_block4=yes + found="yes" + else + use_real_avx512_block4=no + fi - if test x"$ac_exists" = xtrue; then : + if test "$fixed_real_kernel" = "real_avx512_block6" ; then + use_real_avx512_block6=yes + found="yes" + else + use_real_avx512_block6=no + fi -else - ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" -fi - ;; - -bI:*) - ac_exists=false - for ac_i in $ac_cv_fc_libs; do - if test x"$ac_arg" = x"$ac_i"; then - ac_exists=true - break - fi - done + if test "$fixed_real_kernel" = "real_bgp" 
; then + use_real_bgp=yes + found="yes" + else + use_real_bgp=no + fi - if test x"$ac_exists" = xtrue; then : + if test "$fixed_real_kernel" = "real_bgq" ; then + use_real_bgq=yes + found="yes" + else + use_real_bgq=no + fi -else - if test "$ac_compiler_gnu" = yes; then - for ac_link_opt in $ac_arg; do - ac_cv_fc_libs="$ac_cv_fc_libs -Xlinker $ac_link_opt" - done -else - ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" -fi -fi - ;; - # Ignore these flags. - -lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \ - |-LANG:=* | -LIST:* | -LNO:* | -link) - ;; - -lkernel32) - case $host_os in - *cygwin*) ;; - *) ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" - ;; - esac - ;; - -[LRuYz]) - # These flags, when seen by themselves, take an argument. - # We remove the space between option and argument and re-iterate - # unless we find an empty arg or a new option (starting with -) - case $2 in - "" | -*);; - *) - ac_arg="$ac_arg$2" - shift; shift - set X $ac_arg "$@" - ;; - esac - ;; - -YP,*) - for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do - ac_exists=false - for ac_i in $ac_cv_fc_libs; do - if test x"$ac_j" = x"$ac_i"; then - ac_exists=true - break - fi - done + if test x"$found" = x"no" ; then + as_fn_error $? "Invalid kernel \"$fixed_real_kernel\" specified for --with-fixed-real-kernel" "$LINENO" 5 + fi + default_real_kernel="$fixed_real_kernel" - if test x"$ac_exists" = xtrue; then : +$as_echo "#define WITH_FIXED_REAL_KERNEL 1" >>confdefs.h -else - ac_arg="$ac_arg $ac_j" - ac_cv_fc_libs="$ac_cv_fc_libs $ac_j" -fi - done - ;; - -[lLR]*) - ac_exists=false - for ac_i in $ac_cv_fc_libs; do - if test x"$ac_arg" = x"$ac_i"; then - ac_exists=true - break - fi - done + fi - if test x"$ac_exists" = xtrue; then : +# Check whether --with-fixed-complex-kernel was given. 
+if test "${with_fixed_complex_kernel+set}" = set; then : + withval=$with_fixed_complex_kernel; fixed_complex_kernel="complex_$withval" else - ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" + fixed_complex_kernel="" fi - ;; - -zallextract*| -zdefaultextract) - ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg" - ;; - # Ignore everything else. - esac -done -# restore positional arguments -set X $ac_save_positional; shift -# We only consider "LD_RUN_PATH" on Solaris systems. If this is seen, -# then we insist that the "run path" must be an absolute path (i.e. it -# must begin with a "/"). -case `(uname -sr) 2>/dev/null` in - "SunOS 5"*) - ac_ld_run_path=`$as_echo "$ac_fc_v_output" | - sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'` - test "x$ac_ld_run_path" != x && - if test "$ac_compiler_gnu" = yes; then - for ac_link_opt in $ac_ld_run_path; do - ac_cv_fc_libs="$ac_cv_fc_libs -Xlinker $ac_link_opt" - done -else - ac_cv_fc_libs="$ac_cv_fc_libs $ac_ld_run_path" -fi - ;; -esac -fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x" + if test -n "$fixed_complex_kernel" ; then + found="no" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_libs" >&5 -$as_echo "$ac_cv_fc_libs" >&6; } -FCLIBS="$ac_cv_fc_libs" + if test "$fixed_complex_kernel" = "complex_generic" ; then + use_complex_generic=yes + found="yes" + else + use_complex_generic=no + fi + if test "$fixed_complex_kernel" = "complex_generic_simple" ; then + use_complex_generic_simple=yes + found="yes" + else + use_complex_generic_simple=no + fi -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu + if test "$fixed_complex_kernel" = "complex_sse_block1" ; then + use_complex_sse_block1=yes + found="yes" + else + use_complex_sse_block1=no + fi + if test "$fixed_complex_kernel" = "complex_sse_block2" ; then + use_complex_sse_block2=yes + found="yes" + 
else + use_complex_sse_block2=no + fi -save_FCFLAGS=$FCFLAGS -save_LDFLAGS=$LDFLAGS + if test "$fixed_complex_kernel" = "complex_sse_assembly" ; then + use_complex_sse_assembly=yes + found="yes" + else + use_complex_sse_assembly=no + fi + if test "$fixed_complex_kernel" = "complex_avx_block1" ; then + use_complex_avx_block1=yes + found="yes" + else + use_complex_avx_block1=no + fi + if test "$fixed_complex_kernel" = "complex_avx_block2" ; then + use_complex_avx_block2=yes + found="yes" + else + use_complex_avx_block2=no + fi + if test "$fixed_complex_kernel" = "complex_avx2_block1" ; then + use_complex_avx2_block1=yes + found="yes" + else + use_complex_avx2_block1=no + fi -FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS" -LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS" + if test "$fixed_complex_kernel" = "complex_avx2_block2" ; then + use_complex_avx2_block2=yes + found="yes" + else + use_complex_avx2_block2=no + fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Fortran module iso_fortran_env is available" >&5 -$as_echo_n "checking whether Fortran module iso_fortran_env is available... " >&6; } -cat > conftest.$ac_ext <<_ACEOF + if test "$fixed_complex_kernel" = "complex_avx512_block1" ; then + use_complex_avx512_block1=yes + found="yes" + else + use_complex_avx512_block1=no + fi - program test_error_unit - use ISO_FORTRAN_ENV, only : error_unit - implicit none + if test "$fixed_complex_kernel" = "complex_avx512_block2" ; then + use_complex_avx512_block2=yes + found="yes" + else + use_complex_avx512_block2=no + fi - write(error_unit,*) "error_unit is defined" - end program + if test "$fixed_complex_kernel" = "complex_bgp" ; then + use_complex_bgp=yes + found="yes" + else + use_complex_bgp=no + fi + + if test "$fixed_complex_kernel" = "complex_bgq" ; then + use_complex_bgq=yes + found="yes" + else + use_complex_bgq=no + fi + + if test x"$found" = x"no" ; then + as_fn_error $? 
"Invalid kernel \"$fixed_complex_kernel\" specified for --with-fixed-complex-kernel" "$LINENO" 5 + fi + default_complex_kernel="$fixed_complex_kernel" + +$as_echo "#define WITH_FIXED_COMPLEX_KERNEL 1" >>confdefs.h + + fi -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - can_use_iso_fortran_env=yes -else - can_use_iso_fortran_env=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_use_iso_fortran_env}" >&5 -$as_echo "${can_use_iso_fortran_env}" >&6; } +#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only], +# [Compile and always use the GPU version])], +# [],[with_gpu_support_only=no]) +#if test x"$with_gpu_support_only" = x"yes" ; then +# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ +# use_[]elpa_m4_kernel[]=no +# ]) +# use_real_gpu=yes +# use_complex_gpu=yes +#fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile a Fortran program using MKL" >&5 -$as_echo_n "checking whether we can compile a Fortran program using MKL... " >&6; } -cat > conftest.$ac_ext <<_ACEOF - program test_mkl - use mkl_service - character*198 :: string - call mkl_get_version_string(string) - write(*,'(a)') string - end program -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - can_compile_with_mkl=yes -else - can_compile_with_mkl=no + if test x"$use_real_sparc64_block6" = x"yes"; then -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_with_mkl}" >&5 -$as_echo "${can_compile_with_mkl}" >&6; } + if test x"$use_real_sparc64_block4" = x"no" ; then + echo "Enabling real_sparc64_block4 kernel, is a prerequisite for real_sparc64_block6" + fi + use_real_sparc64_block4=yes -if test x"$can_compile_with_mkl" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a Fortran program with MKL" >&5 -$as_echo_n "checking whether we can link a Fortran program with MKL... 
" >&6; } - cat > conftest.$ac_ext <<_ACEOF + if test x"$use_real_sparc64_block2" = x"no" ; then + echo "Enabling real_sparc64_block2 kernel, is a prerequisite for real_sparc64_block6" + fi + use_real_sparc64_block2=yes - program test_mkl - use mkl_service - character*198 :: string - call mkl_get_version_string(string) - write(*,'(a)') string - end program + fi -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - have_mkl=yes -else - have_mkl=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_mkl}" >&5 -$as_echo "${have_mkl}" >&6; } -fi + if test x"$use_real_sparc64_block4" = x"yes"; then -if test x"${have_mkl}" = x"yes" ; then - WITH_MKL=1 -else + if test x"$use_real_sparc64_block2" = x"no" ; then + echo "Enabling real_sparc64_block2 kernel, is a prerequisite for real_sparc64_block4" + fi + use_real_sparc64_block2=yes - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dgemm" >&5 -$as_echo_n "checking for library containing dgemm... 
" >&6; } -if ${ac_cv_search_dgemm+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call dgemm - end -_ACEOF -for ac_lib in '' blas; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_dgemm=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_dgemm+:} false; then : - break -fi -done -if ${ac_cv_search_dgemm+:} false; then : + fi -else - ac_cv_search_dgemm=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dgemm" >&5 -$as_echo "$ac_cv_search_dgemm" >&6; } -ac_res=$ac_cv_search_dgemm -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - have_blas=yes -else - have_blas=no -fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a program with a blas lib" >&5 -$as_echo_n "checking whether we can link a program with a blas lib... " >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_blas}" >&5 -$as_echo "${have_blas}" >&6; } + if test x"$use_complex_sparc64_block2" = x"yes"; then - if test x"${have_blas}" = x"no" ; then - as_fn_error $? "could not link with blas: specify path" "$LINENO" 5 - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing dlarrv" >&5 -$as_echo_n "checking for library containing dlarrv... 
" >&6; } -if ${ac_cv_search_dlarrv+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call dlarrv - end -_ACEOF -for ac_lib in '' lapack; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_dlarrv=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_dlarrv+:} false; then : - break -fi -done -if ${ac_cv_search_dlarrv+:} false; then : + if test x"$use_complex_sparc64_block1" = x"no" ; then + echo "Enabling complex_sparc64_block1 kernel, is a prerequisite for complex_sparc64_block2" + fi + use_complex_sparc64_block1=yes -else - ac_cv_search_dlarrv=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_dlarrv" >&5 -$as_echo "$ac_cv_search_dlarrv" >&6; } -ac_res=$ac_cv_search_dlarrv -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - have_lapack=yes -else - have_lapack=no -fi + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a program with a lapack lib" >&5 -$as_echo_n "checking whether we can link a program with a lapack lib... " >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${have_lapack}" >&5 -$as_echo "${have_lapack}" >&6; } - if test x"${have_lapack}" = x"no" ; then - as_fn_error $? "could not link with lapack: specify path" "$LINENO" 5 - fi - if test x"${with_mpi}" = x"yes"; then - scalapack_libs="mpiscalapack scalapack scalapack-openmpi" - old_LIBS="$LIBS" - for lib in ${scalapack_libs}; do - LIBS="-l${lib} ${old_LIBS}" - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -l${lib} already contains a BLACS implementation" >&5 -$as_echo_n "checking whether -l${lib} already contains a BLACS implementation... 
" >&6; } - cat > conftest.$ac_ext <<_ACEOF - program main - call blacs_gridinit - end -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - blacs_in_scalapack=yes -else - blacs_in_scalapack=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${blacs_in_scalapack}" >&5 -$as_echo "${blacs_in_scalapack}" >&6; } - if test x"${blacs_in_scalapack}" = x"yes"; then - break - fi - done + if test x"$use_real_neon_arch64_block6" = x"yes"; then - if test x"${blacs_in_scalapack}" = x"no"; then - LIBS="${old_LIBS}" + if test x"$use_real_neon_arch64_block4" = x"no" ; then + echo "Enabling real_neon_arch64_block4 kernel, is a prerequisite for real_neon_arch64_block6" + fi + use_real_neon_arch64_block4=yes - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing bi_f77_init" >&5 -$as_echo_n "checking for library containing bi_f77_init... " >&6; } -if ${ac_cv_search_bi_f77_init+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call bi_f77_init - end -_ACEOF -for ac_lib in '' mpiblacsF77init; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib -lmpiblacs $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_bi_f77_init=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_bi_f77_init+:} false; then : - break -fi -done -if ${ac_cv_search_bi_f77_init+:} false; then : + if test x"$use_real_neon_arch64_block2" = x"no" ; then + echo "Enabling real_neon_arch64_block2 kernel, is a prerequisite for real_neon_arch64_block6" + fi + use_real_neon_arch64_block2=yes -else - ac_cv_search_bi_f77_init=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_bi_f77_init" >&5 -$as_echo "$ac_cv_search_bi_f77_init" >&6; 
} -ac_res=$ac_cv_search_bi_f77_init -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + fi -fi + if test x"$use_real_neon_arch64_block4" = x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing blacs_gridinit" >&5 -$as_echo_n "checking for library containing blacs_gridinit... " >&6; } -if ${ac_cv_search_blacs_gridinit+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call blacs_gridinit - end -_ACEOF -for ac_lib in '' blacs-openmpi; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib -lblacsCinit-openmpi -lscalapack-openmpi $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_blacs_gridinit=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_blacs_gridinit+:} false; then : - break -fi -done -if ${ac_cv_search_blacs_gridinit+:} false; then : + if test x"$use_real_neon_arch64_block2" = x"no" ; then + echo "Enabling real_neon_arch64_block2 kernel, is a prerequisite for real_neon_arch64_block4" + fi + use_real_neon_arch64_block2=yes -else - ac_cv_search_blacs_gridinit=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_blacs_gridinit" >&5 -$as_echo "$ac_cv_search_blacs_gridinit" >&6; } -ac_res=$ac_cv_search_blacs_gridinit -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - have_blacs=yes -else - have_blacs=no -fi + fi - if test x"${have_blacs}" = x"no"; then - unset ac_cv_search_blacs_gridinit - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing blacs_gridinit" >&5 -$as_echo_n "checking for library containing blacs_gridinit... 
" >&6; } -if ${ac_cv_search_blacs_gridinit+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call blacs_gridinit - end -_ACEOF -for ac_lib in '' mpiblacs blacs; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_blacs_gridinit=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_blacs_gridinit+:} false; then : - break -fi -done -if ${ac_cv_search_blacs_gridinit+:} false; then : -else - ac_cv_search_blacs_gridinit=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_blacs_gridinit" >&5 -$as_echo "$ac_cv_search_blacs_gridinit" >&6; } -ac_res=$ac_cv_search_blacs_gridinit -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - have_blacs=yes -else - have_blacs=no -fi + if test x"$use_complex_neon_arch64_block2" = x"yes"; then + if test x"$use_complex_neon_arch64_block1" = x"no" ; then + echo "Enabling complex_neon_arch64_block1 kernel, is a prerequisite for complex_neon_arch64_block2" + fi + use_complex_neon_arch64_block1=yes - if test x"${have_blacs}" = x"no"; then - as_fn_error $? "No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure" "$LINENO" 5 - fi - fi + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing pdtran" >&5 -$as_echo_n "checking for library containing pdtran... 
" >&6; } -if ${ac_cv_search_pdtran+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat > conftest.$ac_ext <<_ACEOF - program main - call pdtran - end -_ACEOF -for ac_lib in '' $scalapack_libs; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_fc_try_link "$LINENO"; then : - ac_cv_search_pdtran=$ac_res -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_pdtran+:} false; then : - break -fi -done -if ${ac_cv_search_pdtran+:} false; then : -else - ac_cv_search_pdtran=no -fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_pdtran" >&5 -$as_echo "$ac_cv_search_pdtran" >&6; } -ac_res=$ac_cv_search_pdtran -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" - have_scalapack=yes -else - have_scalapack=no -fi + if test x"$use_real_vsx_block6" = x"yes"; then - if test x"${have_scalapack}" = x"no" ; then - as_fn_error $? "could not link with scalapack: specify path" "$LINENO" 5 - fi - fi + if test x"$use_real_vsx_block4" = x"no" ; then + echo "Enabling real_vsx_block4 kernel, is a prerequisite for real_vsx_block6" + fi + use_real_vsx_block4=yes - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can link a Fortran program with all blacs/scalapack" >&5 -$as_echo_n "checking whether we can link a Fortran program with all blacs/scalapack... 
" >&6; } - cat > conftest.$ac_ext <<_ACEOF + if test x"$use_real_vsx_block2" = x"no" ; then + echo "Enabling real_vsx_block2 kernel, is a prerequisite for real_vsx_block6" + fi + use_real_vsx_block2=yes - program dgemm_test + fi - integer , parameter:: M = 4, N = 3, K = 2 - real :: A(M,K), B(K,N), C(M,N) - call dgemm('N','N',M,N,K,1.0,A,M,B,K,0.0,C,M) + if test x"$use_real_vsx_block4" = x"yes"; then - end program dgemm_test + if test x"$use_real_vsx_block2" = x"no" ; then + echo "Enabling real_vsx_block2 kernel, is a prerequisite for real_vsx_block4" + fi + use_real_vsx_block2=yes -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - can_link_with_blacs_scalapack=yes -else - can_link_with_blacs_scalapack=no + fi -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_link_with_blacs_scalapack}" >&5 -$as_echo "${can_link_with_blacs_scalapack}" >&6; } - if test x"${can_link_with_blacs_scalapack}" = x"yes" ; then - WITH_BLACS=1 - else - as_fn_error $? "We can neither link with MKL or another Scalpack. Please specify SCALAPACK_LDFLAGS and SCALAPACK_FCFLAGS!" "$LINENO" 5 - fi -fi + if test x"$use_complex_vsx_block2" = x"yes"; then -FCFLAGS=$save_FCFLAGS -LDFLAGS=$save_LDFLAGS + if test x"$use_complex_vsx_block1" = x"no" ; then + echo "Enabling complex_vsx_block1 kernel, is a prerequisite for complex_vsx_block2" + fi + use_complex_vsx_block1=yes -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can use the intrinsic Fortran function \"get_environment_variable\"" >&5 -$as_echo_n "checking whether we can use the intrinsic Fortran function \"get_environment_variable\"... 
" >&6; } + fi -cat > conftest.$ac_ext <<_ACEOF - program test_get_environment - character(len=256) :: homedir - call get_environment_variable("HOME",homedir) - end program + if test x"$use_real_sse_block6" = x"yes"; then -_ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : - fortran_can_check_environment=yes -else - fortran_can_check_environment=no + if test x"$use_real_sse_block4" = x"no" ; then + echo "Enabling real_sse_block4 kernel, is a prerequisite for real_sse_block6" + fi + use_real_sse_block4=yes -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${fortran_can_check_environment}" >&5 -$as_echo "${fortran_can_check_environment}" >&6; } + if test x"$use_real_sse_block2" = x"no" ; then + echo "Enabling real_sse_block2 kernel, is a prerequisite for real_sse_block6" + fi + use_real_sse_block2=yes + fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile with BGP intrinsics" >&5 -$as_echo_n "checking whether we can compile with BGP intrinsics... 
" >&6; } + if test x"$use_real_sse_block4" = x"yes"; then + if test x"$use_real_sse_block2" = x"no" ; then + echo "Enabling real_sse_block2 kernel, is a prerequisite for real_sse_block4" + fi + use_real_sse_block2=yes -cat > conftest.$ac_ext <<_ACEOF + fi - program test_bgp - complex*16 :: y3,q3,h2 - y3 = fxcpmadd(y3,q3,h2) - end program + if test x"$use_complex_sse_block2" = x"yes"; then -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - can_compile_bgp=yes -else - can_compile_bgp=no + if test x"$use_complex_sse_block1" = x"no" ; then + echo "Enabling complex_sse_block1 kernel, is a prerequisite for complex_sse_block2" + fi + use_complex_sse_block1=yes -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_bgp}" >&5 -$as_echo "${can_compile_bgp}" >&6; } + fi -if test x"${can_compile_bgp}" = x"yes" ; then - install_real_bgp=yes - install_complex_bgp=yes -else - install_real_bgp=no - install_complex_bgp=no -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile with BGQ intrinsics" >&5 -$as_echo_n "checking whether we can compile with BGQ intrinsics... 
" >&6; } -cat > conftest.$ac_ext <<_ACEOF + if test x"$use_real_avx_block6" = x"yes"; then - program test_bgq - VECTOR(REAL(8))::QPX_h2 - real*8 :: hh(10,2) - QPX_h2 = VEC_SPLATS(hh(2,2)) + if test x"$use_real_avx_block4" = x"no" ; then + echo "Enabling real_avx_block4 kernel, is a prerequisite for real_avx_block6" + fi + use_real_avx_block4=yes - end program + if test x"$use_real_avx_block2" = x"no" ; then + echo "Enabling real_avx_block2 kernel, is a prerequisite for real_avx_block6" + fi + use_real_avx_block2=yes -_ACEOF -if ac_fn_fc_try_link "$LINENO"; then : - can_compile_bgq=yes -else - can_compile_bgq=no + fi -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_bgq}" >&5 -$as_echo "${can_compile_bgq}" >&6; } -if test x"${can_compile_bgq}" = x"yes" ; then - install_real_bgq=yes - install_complex_bgq=yes -else - install_real_bgq=no - install_complex_bgq=no -fi + if test x"$use_real_avx_block4" = x"yes"; then + if test x"$use_real_avx_block2" = x"no" ; then + echo "Enabling real_avx_block2 kernel, is a prerequisite for real_avx_block4" + fi + use_real_avx_block2=yes -if test x"${fortran_can_check_environment}" = x"yes" ; then + fi -$as_echo "#define HAVE_ENVIRONMENT_CHECKING 1" >>confdefs.h -fi + if test x"$use_complex_avx_block2" = x"yes"; then + if test x"$use_complex_avx_block1" = x"no" ; then + echo "Enabling complex_avx_block1 kernel, is a prerequisite for complex_avx_block2" + fi + use_complex_avx_block1=yes + fi + if test x"$use_real_avx2_block6" = x"yes"; then + if test x"$use_real_avx2_block4" = x"no" ; then + echo "Enabling real_avx2_block4 kernel, is a prerequisite for real_avx2_block6" + fi + use_real_avx2_block4=yes + if test x"$use_real_avx2_block2" = x"no" ; then + echo "Enabling real_avx2_block2 kernel, is a prerequisite for real_avx2_block6" + fi + use_real_avx2_block2=yes + fi + if test x"$use_real_avx2_block4" = x"yes"; then + if test 
x"$use_real_avx2_block2" = x"no" ; then + echo "Enabling real_avx2_block2 kernel, is a prerequisite for real_avx2_block4" + fi + use_real_avx2_block2=yes - use_specific_real_kernel=no + fi + if test x"$use_complex_avx2_block2" = x"yes"; then -# Check whether --with-real-generic-kernel-only was given. -if test "${with_real_generic_kernel_only+set}" = set; then : - withval=$with_real_generic_kernel_only; with_option=yes -else - with_option=no -fi + if test x"$use_complex_avx2_block1" = x"no" ; then + echo "Enabling complex_avx2_block1 kernel, is a prerequisite for complex_avx2_block2" + fi + use_complex_avx2_block1=yes + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_generic=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. 
Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-generic-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_avx512_block6" = x"yes"; then - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_avx512_block4" = x"no" ; then + echo "Enabling real_avx512_block4 kernel, is a prerequisite for real_avx512_block6" + fi + use_real_avx512_block4=yes - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_avx512_block2" = x"no" ; then + echo "Enabling real_avx512_block2 kernel, is a prerequisite for real_avx512_block6" + fi + use_real_avx512_block2=yes - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_avx512_block4" = x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-generic-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-generic-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$use_real_avx512_block2" = x"no" ; then + echo "Enabling real_avx512_block2 kernel, is a prerequisite for real_avx512_block4" + fi + use_real_avx512_block2=yes + fi + if test x"$use_complex_avx512_block2" = x"yes"; then -# Check whether --with-real-generic-simple-kernel-only was given. 
-if test "${with_real_generic_simple_kernel_only+set}" = set; then : - withval=$with_real_generic_simple_kernel_only; with_option=yes -else - with_option=no -fi + if test x"$use_complex_avx512_block1" = x"no" ; then + echo "Enabling complex_avx512_block1 kernel, is a prerequisite for complex_avx512_block2" + fi + use_complex_avx512_block1=yes + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_generic_simple=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. 
Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + need_generic=no + need_generic_kernels="" - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_generic" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels real_generic" + fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic_simple" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels real_generic_simple" + fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic_simple_block4" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels real_generic_simple_block4" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic_simple_block6" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels real_generic_simple_block6" + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-generic-simple-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-generic-simple-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-generic-simple-kernel-only failed; A specific kernel for real case has already been defined before! 
-See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$use_complex_generic" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels complex_generic" + fi + if test x"$use_complex_generic_simple" = x"yes" ; then + need_generic=yes + need_generic_kernels="$need_generic_kernels complex_generic_simple" + fi + need_sparc64=no + need_sparc64_kernels="" -# Check whether --with-real-sse-assembly-kernel-only was given. -if test "${with_real_sse_assembly_kernel_only+set}" = set; then : - withval=$with_real_sse_assembly_kernel_only; with_option=yes -else - with_option=no -fi + if test x"$use_real_sparc64_block2" = x"yes" ; then + need_sparc64=yes + need_sparc64_kernels="$need_sparc64_kernels real_sparc64_block2" + fi + if test x"$use_real_sparc64_block4" = x"yes" ; then + need_sparc64=yes + need_sparc64_kernels="$need_sparc64_kernels real_sparc64_block4" + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_sse_assembly=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. 
Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + if test x"$use_real_sparc64_block6" = x"yes" ; then + need_sparc64=yes + need_sparc64_kernels="$need_sparc64_kernels real_sparc64_block6" + fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + need_neon_arch64=no + need_neon_arch64_kernels="" - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_neon_arch64_block2" = x"yes" ; then + need_neon_arch64=yes + need_neon_arch64_kernels="$need_neon_arch64_kernels real_neon_arch64_block2" + fi + + if test x"$use_real_neon_arch64_block4" = x"yes" ; then + need_neon_arch64=yes + need_neon_arch64_kernels="$need_neon_arch64_kernels real_neon_arch64_block4" + fi + + if test x"$use_real_neon_arch64_block6" = x"yes" ; then + need_neon_arch64=yes + need_neon_arch64_kernels="$need_neon_arch64_kernels real_neon_arch64_block6" + fi + + + need_vsx=no + need_vsx_kernels="" + + if test x"$use_real_vsx_block2" = x"yes" ; then + need_vsx=yes + need_vsx_kernels="$need_vsx_kernels real_vsx_block2" + fi + + if test x"$use_real_vsx_block4" = x"yes" ; then + need_vsx=yes + need_vsx_kernels="$need_vsx_kernels real_vsx_block4" + fi + + if test x"$use_real_vsx_block6" = x"yes" ; then + need_vsx=yes + need_vsx_kernels="$need_vsx_kernels real_vsx_block6" + fi + + + need_sse=no + need_sse_kernels="" + + if test x"$use_real_sse_block2" = x"yes" ; then + need_sse=yes + need_sse_kernels="$need_sse_kernels real_sse_block2" + fi + + if test x"$use_real_sse_block4" = x"yes" ; then + need_sse=yes + need_sse_kernels="$need_sse_kernels real_sse_block4" + fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_real_sse_block6" = x"yes" ; then + need_sse=yes + need_sse_kernels="$need_sse_kernels real_sse_block6" + fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_complex_sse_block1" = x"yes" ; then + need_sse=yes + need_sse_kernels="$need_sse_kernels complex_sse_block1" + fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_complex_sse_block2" = x"yes" ; then + need_sse=yes + need_sse_kernels="$need_sse_kernels complex_sse_block2" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-assembly-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-sse-assembly-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-sse-assembly-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + need_sse_assembly=no + need_sse_assembly_kernels="" + if test x"$use_real_sse_assembly" = x"yes" ; then + need_sse_assembly=yes + need_sse_assembly_kernels="$need_sse_assembly_kernels real_sse_assembly" + fi + if test x"$use_complex_sse_assembly" = x"yes" ; then + need_sse_assembly=yes + need_sse_assembly_kernels="$need_sse_assembly_kernels complex_sse_assembly" + fi -# Check whether --with-real-bgp-kernel-only was given. 
-if test "${with_real_bgp_kernel_only+set}" = set; then : - withval=$with_real_bgp_kernel_only; with_option=yes -else - with_option=no -fi + need_avx=no + need_avx_kernels="" + if test x"$use_real_avx_block2" = x"yes" ; then + need_avx=yes + need_avx_kernels="$need_avx_kernels real_avx_block2" + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_bgp=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. 
Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-bgp-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + if test x"$use_real_avx_block4" = x"yes" ; then + need_avx=yes + need_avx_kernels="$need_avx_kernels real_avx_block4" + fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + if test x"$use_real_avx_block6" = x"yes" ; then + need_avx=yes + need_avx_kernels="$need_avx_kernels real_avx_block6" + fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_complex_avx_block1" = x"yes" ; then + need_avx=yes + need_avx_kernels="$need_avx_kernels complex_avx_block1" + fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_complex_avx_block2" = x"yes" ; then + need_avx=yes + need_avx_kernels="$need_avx_kernels complex_avx_block2" + fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + need_avx2=no + need_avx2_kernels="" - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_avx2_block2" = x"yes" ; then + need_avx2=yes + need_avx2_kernels="$need_avx2_kernels real_avx2_block2" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_avx2_block4" = x"yes" ; then + need_avx2=yes + need_avx2_kernels="$need_avx2_kernels real_avx2_block4" + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgp-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-bgp-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-bgp-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$use_real_avx2_block6" = x"yes" ; then + need_avx2=yes + need_avx2_kernels="$need_avx2_kernels real_avx2_block6" + fi + if test x"$use_complex_avx2_block1" = x"yes" ; then + need_avx2=yes + need_avx2_kernels="$need_avx2_kernels complex_avx2_block1" + fi + if test x"$use_complex_avx2_block2" = x"yes" ; then + need_avx2=yes + need_avx2_kernels="$need_avx2_kernels complex_avx2_block2" + fi -# Check whether --with-real-bgq-kernel-only was given. 
-if test "${with_real_bgq_kernel_only+set}" = set; then : - withval=$with_real_bgq_kernel_only; with_option=yes -else - with_option=no -fi + need_avx512=no + need_avx512_kernels="" + if test x"$use_real_avx512_block2" = x"yes" ; then + need_avx512=yes + need_avx512_kernels="$need_avx512_kernels real_avx512_block2" + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_bgq=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. 
Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-bgq-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + if test x"$use_real_avx512_block4" = x"yes" ; then + need_avx512=yes + need_avx512_kernels="$need_avx512_kernels real_avx512_block4" + fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + if test x"$use_real_avx512_block6" = x"yes" ; then + need_avx512=yes + need_avx512_kernels="$need_avx512_kernels real_avx512_block6" + fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_complex_avx512_block1" = x"yes" ; then + need_avx512=yes + need_avx512_kernels="$need_avx512_kernels complex_avx512_block1" + fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_complex_avx512_block2" = x"yes" ; then + need_avx512=yes + need_avx512_kernels="$need_avx512_kernels complex_avx512_block2" + fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + need_bgp=no + need_bgp_kernels="" - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_bgp" = x"yes" ; then + need_bgp=yes + need_bgp_kernels="$need_bgp_kernels real_bgp" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_complex_bgp" = x"yes" ; then + need_bgp=yes + need_bgp_kernels="$need_bgp_kernels complex_bgp" + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-bgq-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-bgq-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-bgq-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + need_bgq=no + need_bgq_kernels="" + if test x"$use_real_bgq" = x"yes" ; then + need_bgq=yes + need_bgq_kernels="$need_bgq_kernels real_bgq" + fi + if test x"$use_complex_bgq" = x"yes" ; then + need_bgq=yes + need_bgq_kernels="$need_bgq_kernels complex_bgq" + fi -# Check whether --with-real-sse-block2-kernel-only was given. 
-if test "${with_real_sse_block2_kernel_only+set}" = set; then : - withval=$with_real_sse_block2_kernel_only; with_option=yes -else - with_option=no -fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_sse_block2=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. 
Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + if test x"$need_generic" = x"yes" ; then + echo "Using GENERIC for kernels$need_generic_kernels" + fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + if test x"$need_sparc64" = x"yes" ; then + echo "Using SPARC64 for kernels$need_sparc64_kernels" + fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_neon_arch64" = x"yes" ; then + echo "Using NEON_ARCH64 for kernels$need_neon_arch64_kernels" + fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_vsx" = x"yes" ; then + echo "Using VSX for kernels$need_vsx_kernels" + fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_sse" = x"yes" ; then + echo "Using SSE for kernels$need_sse_kernels" + fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_sse_assembly" = x"yes" ; then + echo "Using SSE_ASSEMBLY for kernels$need_sse_assembly_kernels" + fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_avx" = x"yes" ; then + echo "Using AVX for kernels$need_avx_kernels" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_avx2" = x"yes" ; then + echo "Using AVX2 for kernels$need_avx2_kernels" + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block2-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-sse-block2-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-sse-block2-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$need_avx512" = x"yes" ; then + echo "Using AVX512 for kernels$need_avx512_kernels" + fi + if test x"$need_bgp" = x"yes" ; then + echo "Using BGP for kernels$need_bgp_kernels" + fi + if test x"$need_bgq" = x"yes" ; then + echo "Using BGQ for kernels$need_bgq_kernels" + fi -# Check whether --with-real-sse-block4-kernel-only was given. 
-if test "${with_real_sse_block4_kernel_only+set}" = set; then : - withval=$with_real_sse_block4_kernel_only; with_option=yes -else - with_option=no -fi + if test x"$need_generic" != x"yes" ; then + echo "Not using GENERIC as no selected kernel needs it" + fi + if test x"$need_sparc64" != x"yes" ; then + echo "Not using SPARC64 as no selected kernel needs it" + fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_sse_block4=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. 
Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + if test x"$need_neon_arch64" != x"yes" ; then + echo "Not using NEON_ARCH64 as no selected kernel needs it" + fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + if test x"$need_vsx" != x"yes" ; then + echo "Not using VSX as no selected kernel needs it" + fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_sse" != x"yes" ; then + echo "Not using SSE as no selected kernel needs it" + fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_sse_assembly" != x"yes" ; then + echo "Not using SSE_ASSEMBLY as no selected kernel needs it" + fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$need_avx" != x"yes" ; then + echo "Not using AVX as no selected kernel needs it" + fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_avx2" != x"yes" ; then + echo "Not using AVX2 as no selected kernel needs it" + fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_avx512" != x"yes" ; then + echo "Not using AVX512 as no selected kernel needs it" + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$need_bgp" != x"yes" ; then + echo "Not using BGP as no selected kernel needs it" + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block4-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-sse-block4-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-sse-block4-kernel-only failed; A specific kernel for real case has already been defined before! 
-See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$need_bgq" != x"yes" ; then + echo "Not using BGQ as no selected kernel needs it" + fi -# Check whether --with-real-sse-block6-kernel-only was given. -if test "${with_real_sse_block6_kernel_only+set}" = set; then : - withval=$with_real_sse_block6_kernel_only; with_option=yes -else - with_option=no -fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_sse_block6=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. 
Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi +# Check whether --with-default-real-kernel was given. +if test "${with_default_real_kernel+set}" = set; then : + withval=$with_default_real_kernel; default_real_kernel="real_$withval" +else + default_real_kernel="" +fi + + #if test -n "$default_[]elpa_m4_kind[]_kernel" ; then + # found="no" + # m4_foreach_w([elpa_m4_otherkernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[ + # if test "$default_]elpa_m4_kind[_kernel" = "]elpa_m4_otherkernel[" ; then + # use_[]elpa_m4_otherkernel[]=yes + # found="yes" + # else + # use_[]elpa_m4_otherkernel[]=no + # fi + # ]) + # if test x"$found" = x"no" ; then + # AC_MSG_ERROR([Invalid kernel "$default_]elpa_m4_kind[_kernel" specified for --with-default-]elpa_m4_kind[-kernel]) + # fi + # AC_DEFINE([WITH_DEFAULT_]m4_toupper(elpa_m4_kind)[_KERNEL],[1],[use specific ]elpa_m4_kind[ default kernel (set at compile time)]) + #fi + + +# Check whether --with-default-complex-kernel was given. 
+if test "${with_default_complex_kernel+set}" = set; then : + withval=$with_default_complex_kernel; default_complex_kernel="complex_$withval" +else + default_complex_kernel="" +fi + + #if test -n "$default_[]elpa_m4_kind[]_kernel" ; then + # found="no" + # m4_foreach_w([elpa_m4_otherkernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[ + # if test "$default_]elpa_m4_kind[_kernel" = "]elpa_m4_otherkernel[" ; then + # use_[]elpa_m4_otherkernel[]=yes + # found="yes" + # else + # use_[]elpa_m4_otherkernel[]=no + # fi + # ]) + # if test x"$found" = x"no" ; then + # AC_MSG_ERROR([Invalid kernel "$default_]elpa_m4_kind[_kernel" specified for --with-default-]elpa_m4_kind[-kernel]) + # fi + # AC_DEFINE([WITH_DEFAULT_]m4_toupper(elpa_m4_kind)[_KERNEL],[1],[use specific ]elpa_m4_kind[ default kernel (set at compile time)]) + #fi - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test -z "$default_real_kernel"; then + if test x"$use_real_avx512_block2" = x"yes"; then + default_real_kernel="real_avx512_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx512_block4" = x"yes"; then + default_real_kernel="real_avx512_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx512_block6" = x"yes"; then + default_real_kernel="real_avx512_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx2_block2" = x"yes"; then + default_real_kernel="real_avx2_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx2_block4" = x"yes"; then + default_real_kernel="real_avx2_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx2_block6" = x"yes"; then + default_real_kernel="real_avx2_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx_block2" = x"yes"; then + default_real_kernel="real_avx_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx_block4" = x"yes"; then + default_real_kernel="real_avx_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_avx_block6" = x"yes"; then + default_real_kernel="real_avx_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sse_block2" = x"yes"; then + default_real_kernel="real_sse_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sse_block4" = x"yes"; then + default_real_kernel="real_sse_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sse_block6" = x"yes"; then + 
default_real_kernel="real_sse_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sse_assembly" = x"yes"; then + default_real_kernel="real_sse_assembly" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sparc64_block2" = x"yes"; then + default_real_kernel="real_sparc64_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sparc64_block4" = x"yes"; then + default_real_kernel="real_sparc64_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_sparc64_block6" = x"yes"; then + default_real_kernel="real_sparc64_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_neon_arch64_block2" = x"yes"; then + default_real_kernel="real_neon_arch64_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_neon_arch64_block4" = x"yes"; then + default_real_kernel="real_neon_arch64_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_neon_arch64_block6" = x"yes"; then + default_real_kernel="real_neon_arch64_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_vsx_block2" = x"yes"; then + default_real_kernel="real_vsx_block2" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_vsx_block4" = x"yes"; then + default_real_kernel="real_vsx_block4" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_vsx_block6" = x"yes"; then + default_real_kernel="real_vsx_block6" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_generic" = x"yes"; then + default_real_kernel="real_generic" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_generic_simple" = x"yes"; then + default_real_kernel="real_generic_simple" + fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_generic_simple_block4" = x"yes"; then + default_real_kernel="real_generic_simple_block4" + 
fi + fi + + if test -z "$default_real_kernel"; then + if test x"$use_real_generic_simple_block6" = x"yes"; then + default_real_kernel="real_generic_simple_block6" + fi + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-sse-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test -z "$default_real_kernel"; then + as_fn_error $? "Internal error, could not determine a default kernel" "$LINENO" 5 + fi + # find the number of this kernel + ELPA_2STAGE_REAL_DEFAULT=`grep -i '^ *X(ELPA_2STAGE_'$default_real_kernel'\>' $srcdir/elpa/elpa_constants.h.in | \ + perl -pe 's/^[^,]*, *//; s/,.*//;'` + + + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx512_block1" = x"yes"; then + default_complex_kernel="complex_avx512_block1" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx512_block2" = x"yes"; then + default_complex_kernel="complex_avx512_block2" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx2_block1" = x"yes"; then + default_complex_kernel="complex_avx2_block1" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx2_block2" = x"yes"; then + default_complex_kernel="complex_avx2_block2" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx_block1" = x"yes"; then + default_complex_kernel="complex_avx_block1" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_avx_block2" = x"yes"; then + default_complex_kernel="complex_avx_block2" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_sse_block1" = x"yes"; then + default_complex_kernel="complex_sse_block1" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_sse_block2" = x"yes"; then + default_complex_kernel="complex_sse_block2" + fi + fi + + if test -z 
"$default_complex_kernel"; then + if test x"$use_complex_sse_assembly" = x"yes"; then + default_complex_kernel="complex_sse_assembly" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_generic" = x"yes"; then + default_complex_kernel="complex_generic" + fi + fi + + if test -z "$default_complex_kernel"; then + if test x"$use_complex_generic_simple" = x"yes"; then + default_complex_kernel="complex_generic_simple" + fi + fi - { $as_echo "$as_me:${as_lineno-$LINENO}: real-sse-block6-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-sse-block6-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-sse-block6-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test -z "$default_complex_kernel"; then + as_fn_error $? "Internal error, could not determine a default kernel" "$LINENO" 5 + fi + # find the number of this kernel + ELPA_2STAGE_COMPLEX_DEFAULT=`grep -i '^ *X(ELPA_2STAGE_'$default_complex_kernel'\>' $srcdir/elpa/elpa_constants.h.in | \ + perl -pe 's/^[^,]*, *//; s/,.*//;'` +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -# Check whether --with-real-avx-block2-kernel-only was given. 
-if test "${with_real_avx_block2_kernel_only+set}" = set; then : - withval=$with_real_avx_block2_kernel_only; with_option=yes -else - with_option=no -fi +if test x"${need_vsx}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile Altivec VSX with intrinsics in C" >&5 +$as_echo_n "checking whether we can compile Altivec VSX with intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_avx_block2=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. 
Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi +#include +int main(int argc, char **argv) { + __vector double a, b, c; + c = vec_add(a,b); + return 0; +} - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_vsx=yes +else + can_compile_vsx=no - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_vsx}" >&5 +$as_echo "${can_compile_vsx}" >&6; } + if test x"$can_compile_vsx" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS" "$LINENO" 5 + fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi +$as_echo "#define HAVE_VSX_SSE 1" >>confdefs.h - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +if test x"${need_sparc64}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile SPARC64 with intrinsics in C" >&5 +$as_echo_n "checking whether we can compile SPARC64 with intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +#include +#include +int main(int argc, char **argv) { + __m128d tau1; + __m128d h1 = _fjsp_neg_v2r8(tau1); + return 0; +} - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block2-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_sparc64=yes +else + can_compile_sparc64=no - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block2-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-avx-block2-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-avx-block2-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sparc64}" >&5 +$as_echo "${can_compile_sparc64}" >&6; } + if test x"$can_compile_sparc64" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-sparc64, or adjust the C compiler or CFLAGS" "$LINENO" 5 fi +$as_echo "#define HAVE_SPARC64_SSE 1" >>confdefs.h + +fi + +if test x"${need_neon_arch64}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile NEON ARCH64 with intrinsics in C" >&5 +$as_echo_n "checking whether we can compile NEON ARCH64 with intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int main(int argc, char **argv) { + __Float64x2_t x1, x2, x3, x4; + x4 = vfmaq_f64(x1, x2, x3); + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_neon_arch64=yes +else + can_compile_neon_arch64=no -# Check whether --with-real-avx-block4-kernel-only was given. 
-if test "${with_real_avx_block4_kernel_only+set}" = set; then : - withval=$with_real_avx_block4_kernel_only; with_option=yes -else - with_option=no fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_neon_arch64}" >&5 +$as_echo "${can_compile_neon_arch64}" >&6; } + if test x"$can_compile_neon_arch64" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS" "$LINENO" 5 + fi +$as_echo "#define HAVE_NEON_ARCH64_SSE 1" >>confdefs.h - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_avx_block4=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. 
Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only set. Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi - - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi +fi - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +if test x"${need_sse}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile SSE3 with gcc intrinsics in C" >&5 +$as_echo_n "checking whether we can compile SSE3 with gcc intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi +#include +int main(int argc, char **argv) { + double* q; + __m128d h1 = _mm_loaddup_pd(q); + return 0; +} - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_sse=yes +else + can_compile_sse=no - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse}" >&5 +$as_echo "${can_compile_sse}" >&6; } + if test x"$can_compile_sse" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-sse, or adjust the C compiler or CFLAGS" "$LINENO" 5 + fi - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block4-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +$as_echo "#define HAVE_SSE_INTRINSICS 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block4-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-avx-block4-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? 
"real-avx-block4-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi +fi +if test x"${need_sse_assembly}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether double-precision SSE assembly kernels can be compiled" >&5 +$as_echo_n "checking whether double-precision SSE assembly kernels can be compiled... " >&6; } + $CC $CFLAGS -c $srcdir/src/elpa2/kernels/asm_x86_64_double_precision.s -o conftest.o 2>&5 -# Check whether --with-real-avx-block6-kernel-only was given. -if test "${with_real_avx_block6_kernel_only+set}" = set; then : - withval=$with_real_avx_block6_kernel_only; with_option=yes -else - with_option=no -fi + if test "$?" == 0; then + can_compile_sse_asm_double=yes + else + can_compile_sse_asm_double=no + fi + rm -f ./conftest.o + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse_asm_double}" >&5 +$as_echo "${can_compile_sse_asm_double}" >&6; } + if test x"$can_compile_sse_asm_double" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. Possibly (some of) the flags \" $SIMD_FLAGS \" solve this issue" "$LINENO" 5 + fi + if test x"${want_single_precision}" = x"yes" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether single-precision SSE assembly kernels can be compiled" >&5 +$as_echo_n "checking whether single-precision SSE assembly kernels can be compiled... 
" >&6; } - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - install_real_avx_block6=yes - if test x"${install_real_sse_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also sse_block2 is needed" >&6;} - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also avx_block2 is needed" >&6;} - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. Also sse_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also sse_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. Also sse_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also sse_block4 is needed" >&6;} - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. Also avx_block2 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also avx_block2 is needed" >&6;} - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only set. 
Also avx_block4 is needed" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only set. Also avx_block4 is needed" >&6;} - install_real_avx_block4=yes - install_real_avx_block2=yes - fi + $CC $CFLAGS -c $srcdir/src/elpa2/kernels/asm_x86_64_single_precision.s -o conftest.o 2>&5 - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi + if test "$?" == 0; then + can_compile_sse_asm_single=yes + else + can_compile_sse_asm_single=no fi - - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi + rm -f ./conftest.o + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_sse_asm_single}" >&5 +$as_echo "${can_compile_sse_asm_single}" >&6; } + if test x"$can_compile_sse_asm_single" != x"yes"; then + as_fn_error $? "Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. Possibly (some of) the flags \" $SIMD_FLAGS \" solve this issue" "$LINENO" 5 fi + fi +fi - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi +if test x"${need_avx}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX gcc intrinsics in C" >&5 +$as_echo_n "checking whether we can compile AVX gcc intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + #include + int main(int argc, char **argv){ + double* q; + __m256d a1_1 = _mm256_load_pd(q); + return 0; + } - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "real-avx-block6-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_avx=yes +else + can_compile_avx=no - { $as_echo "$as_me:${as_lineno-$LINENO}: real-avx-block6-kernel-only will be the only compiled kernel for real case" >&5 -$as_echo "$as_me: real-avx-block6-kernel-only will be the only compiled kernel for real case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "real-avx-block6-kernel-only failed; A specific kernel for real case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx}" >&5 +$as_echo "${can_compile_avx}" >&6; } + if test x"$can_compile_avx" != x"yes"; then + as_fn_error $? 
"Could not compile a test program with AVX, try with --disable-avx, or adjust the C compiler or CFLAGS. Possibly (some of) the flags \" $SIMD_FLAGS \" solve this issue" "$LINENO" 5 fi +$as_echo "#define HAVE_AVX 1" >>confdefs.h +fi - use_specific_complex_kernel=no - +if test x"${need_avx2}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 gcc intrinsics in C" >&5 +$as_echo_n "checking whether we can compile AVX2 gcc intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + #include + int main(int argc, char **argv){ + double* q; + __m256d q1 = _mm256_load_pd(q); + __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); + return 0; + } -# Check whether --with-complex-generic-kernel-only was given. -if test "${with_complex_generic_kernel_only+set}" = set; then : - withval=$with_complex_generic_kernel_only; with_option=yes +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_avx2=yes else - with_option=no + can_compile_avx2=no + fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx2}" >&5 +$as_echo "${can_compile_avx2}" >&6; } + if test x"$can_compile_avx2" != x"yes"; then + as_fn_error $? "Could not compile a test program with AVX2, try with --disable-avx2, or adjust the C compiler or CFLAGS. 
Possibly (some of) the flags \" $SIMD_FLAGS \" solve this issue" "$LINENO" 5 + fi +$as_echo "#define HAVE_AVX2 1" >>confdefs.h - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then +fi - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_generic=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi +if test x"${need_avx512}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 gcc intrinsics in C" >&5 +$as_echo_n "checking whether we can compile AVX512 gcc intrinsics in C... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + #include + int main(int argc, char **argv){ + double* q; + __m512d q1 = _mm512_load_pd(q); + __m512d y1 = _mm512_fmadd_pd(q1, q1, q1); + return 0; + } - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_avx512=yes +else + can_compile_avx512=no - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? 
"generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx512}" >&5 +$as_echo "${can_compile_avx512}" >&6; } + if test x"$can_compile_avx512" != x"yes"; then + as_fn_error $? "Could not compile a test program with AVX512, adjust the C compiler or CFLAGS. Possibly (some of) the flags \" $SIMD_FLAGS \" solve this issue" "$LINENO" 5 + fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +$as_echo "#define HAVE_AVX512 1" >>confdefs.h - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-generic-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-generic-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-generic-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$can_compile_avx512" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we compile for Xeon" >&5 +$as_echo_n "checking whether we compile for Xeon... 
" >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + #include + int main(int argc, char **argv){ + __m512d sign; + __m512d h1_real; + + __m512d x1 = _mm512_xor_pd(h1_real, sign); + return 0; + } +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_avx512_xeon=yes +else + can_compile_avx512_xeon=no -# Check whether --with-complex-generic-simple-kernel-only was given. -if test "${with_complex_generic_simple_kernel_only+set}" = set; then : - withval=$with_complex_generic_simple_kernel_only; with_option=yes -else - with_option=no fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx512_xeon}" >&5 +$as_echo "${can_compile_avx512_xeon}" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we compile for Xeon PHI" >&5 +$as_echo_n "checking whether we compile for Xeon PHI... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_generic_simple=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi - - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - fi - fi + #include + int main(int argc, char **argv){ + __m512d sign; + __m512d h1; + __m512d h2_real; + + __m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + return 0; + } - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + can_compile_avx512_xeon_phi=yes +else + can_compile_avx512_xeon_phi=no - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_avx512_xeon_phi}" >&5 +$as_echo "${can_compile_avx512_xeon_phi}" >&6; } - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "generic-simple-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi + # this is needed for the intel compiler + if test x"$can_compile_avx512_xeon" = x"yes" ; then + if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then + # we want only one to be true; this is ugly but could not come up with a better way + grep Phi /proc/cpuinfo > /dev/null + if test x"$?" = x"0" ; then + echo "Xeon PHI found ... 
disabling AVX512 Xeon" + can_compile_avx512_xeon=no + fi + fi fi - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-generic-simple-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-generic-simple-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-generic-simple-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$can_compile_avx512_xeon" = x"yes"; then +$as_echo "#define HAVE_AVX512_XEON 1" >>confdefs.h + else + if test x"$can_compile_avx512_xeon_phi" = x"yes"; then +$as_echo "#define HAVE_AVX512_XEON_PHI 1" >>confdefs.h -# Check whether --with-complex-sse-assembly-kernel-only was given. -if test "${with_complex_sse_assembly_kernel_only+set}" = set; then : - withval=$with_complex_sse_assembly_kernel_only; with_option=yes -else - with_option=no + else + as_fn_error $? "Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!" 
"$LINENO" 5 + fi + fi + fi fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_sse_assembly=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +if test x"${need_bgp}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile with BGP intrinsics" >&5 +$as_echo_n "checking whether we can compile with BGP intrinsics... 
" >&6; } + cat > conftest.$ac_ext <<_ACEOF - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + program test_bgp + complex*16 :: y3,q3,h2 + y3 = fxcpmadd(y3,q3,h2) - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "sse-assembly-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + end program - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-sse-assembly-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-sse-assembly-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-sse-assembly-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + can_compile_bgp=yes +else + can_compile_bgp=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_bgp}" >&5 +$as_echo "${can_compile_bgp}" >&6; } + if test x"$can_compile_bgp" != x"yes"; then + as_fn_error $? 
"Could not compile a test program with BGP intrinsics, adjust the FC compiler or FCFLAGS" "$LINENO" 5 + fi +fi +if test x"${need_bgq}" = x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile with BGQ intrinsics" >&5 +$as_echo_n "checking whether we can compile with BGQ intrinsics... " >&6; } + cat > conftest.$ac_ext <<_ACEOF -# Check whether --with-complex-bgp-kernel-only was given. -if test "${with_complex_bgp_kernel_only+set}" = set; then : - withval=$with_complex_bgp_kernel_only; with_option=yes -else - with_option=no -fi + program test_bgq + VECTOR(REAL(8))::QPX_h2 + real*8 :: hh(10,2) + QPX_h2 = VEC_SPLATS(hh(2,2)) + end program - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_bgp=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + can_compile_bgq=yes +else + can_compile_bgq=no - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${can_compile_bgq}" >&5 +$as_echo "${can_compile_bgq}" >&6; } + if test x"$can_compile_bgq" != x"yes"; then + as_fn_error $? 
"Could not compile a test program with BGQ intrinsics, adjust the FC compiler or FCFLAGS" "$LINENO" 5 + fi +fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgp-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether GPU version should be used" >&5 +$as_echo_n "checking whether GPU version should be used... " >&6; } +# Check whether --enable-gpu was given. 
+if test "${enable_gpu+set}" = set; then : + enableval=$enable_gpu; if test x"$enableval" = x"yes"; then + use_gpu=yes + else + use_gpu=no + fi +else + use_gpu=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${use_gpu}" >&5 +$as_echo "${use_gpu}" >&6; } +if test x"${use_gpu}" = x"yes" ; then + need_gpu=yes + use_real_gpu=yes + use_complex_gpu=yes +fi - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-bgp-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-bgp-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-bgp-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi +if test x"${need_gpu}" = x"yes" ; then + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include" + LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64" + NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS" + NVCC="nvcc" + # Extract the first word of "nvcc", so it can be a program name with args. +set dummy nvcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_nvcc_found+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$nvcc_found"; then + ac_cv_prog_nvcc_found="$nvcc_found" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_nvcc_found="yes" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -# Check whether --with-complex-bgq-kernel-only was given. -if test "${with_complex_bgq_kernel_only+set}" = set; then : - withval=$with_complex_bgq_kernel_only; with_option=yes -else - with_option=no + test -z "$ac_cv_prog_nvcc_found" && ac_cv_prog_nvcc_found="no" +fi +fi +nvcc_found=$ac_cv_prog_nvcc_found +if test -n "$nvcc_found"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $nvcc_found" >&5 +$as_echo "$nvcc_found" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_bgq=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi + if test x"${nvcc_found}" = x"no" ; then + as_fn_error $? "nvcc not found; try to set the cuda-path or disable GPU support" "$LINENO" 5 + fi - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - fi - fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing cublasDgemm" >&5 +$as_echo_n "checking for library containing cublasDgemm... " >&6; } +if ${ac_cv_search_cublasDgemm+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cublasDgemm (); +int +main () +{ +return cublasDgemm (); + ; + return 0; +} +_ACEOF +for ac_lib in '' cublas; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_c_try_link "$LINENO"; then : + ac_cv_search_cublasDgemm=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_cublasDgemm+:} false; then : + break +fi +done +if ${ac_cv_search_cublasDgemm+:} false; then : - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi +else + ac_cv_search_cublasDgemm=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_cublasDgemm" >&5 +$as_echo "$ac_cv_search_cublasDgemm" >&6; } +ac_res=$ac_cv_search_cublasDgemm +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_cublas=yes +else + have_cublas=no +fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "bgq-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"${have_cublas}" = x"no"; then + as_fn_error $? "Could not link cublas; try to set the cuda-path or disable GPU support" "$LINENO" 5 + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing cudaMemcpy" >&5 +$as_echo_n "checking for library containing cudaMemcpy... " >&6; } +if ${ac_cv_search_cudaMemcpy+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-bgq-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-bgq-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-bgq-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cudaMemcpy (); +int +main () +{ +return cudaMemcpy (); + ; + return 0; +} +_ACEOF +for ac_lib in '' cudart; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" fi + if ac_fn_c_try_link "$LINENO"; then : + ac_cv_search_cudaMemcpy=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_cudaMemcpy+:} false; then : + break +fi +done +if ${ac_cv_search_cudaMemcpy+:} false; then : +else + ac_cv_search_cudaMemcpy=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_cudaMemcpy" >&5 +$as_echo "$ac_cv_search_cudaMemcpy" >&6; } +ac_res=$ac_cv_search_cudaMemcpy +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_cudart=yes +else + have_cudart=no +fi + if test x"${have_cudart}" = x"no"; then + as_fn_error $? "Could not link cudart; try to set the cuda-path or disable GPU support" "$LINENO" 5 + fi + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +fi -# Check whether --with-complex-sse-block1-kernel-only was given. -if test "${with_complex_sse_block1_kernel_only+set}" = set; then : - withval=$with_complex_sse_block1_kernel_only; with_option=yes -else - with_option=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether GPU memory debugging should be enabled" >&5 +$as_echo_n "checking whether GPU memory debugging should be enabled... " >&6; } +# Check whether --enable-gpu-memory-debug was given. 
+if test "${enable_gpu_memory_debug+set}" = set; then : + enableval=$enable_gpu_memory_debug; if test x"$enableval" = x"yes"; then + enable_gpu_memory_debug=yes + else + enable_gpu_memory_debug=no + fi +else + enable_gpu_memory_debug=no fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_gpu_memory_debug}" >&5 +$as_echo "${enable_gpu_memory_debug}" >&6; } +if test x"${enable_gpu_memory_debug}" = x"yes" ; then - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_sse_block1=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi +$as_echo "#define DEBUG_CUDA 1" >>confdefs.h - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "complex-sse-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi +fi - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-sse-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-sse-block1-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-sse-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-sse-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic" = x"yes"; then + WITH_REAL_GENERIC_KERNEL_TRUE= + WITH_REAL_GENERIC_KERNEL_FALSE='#' +else + WITH_REAL_GENERIC_KERNEL_TRUE='#' + WITH_REAL_GENERIC_KERNEL_FALSE= +fi - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-sse-block1-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-sse-block1-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-sse-block1-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$use_real_generic" = x"yes" ; then +$as_echo "#define WITH_REAL_GENERIC_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_GENERIC_COMPILED=$kernel_defined -# Check whether --with-complex-sse-block2-kernel-only was given. 
-if test "${with_complex_sse_block2_kernel_only+set}" = set; then : - withval=$with_complex_sse_block2_kernel_only; with_option=yes -else - with_option=no + if test x"$use_real_generic_simple" = x"yes"; then + WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE= + WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE='#' +else + WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE='#' + WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE= fi + if test x"$use_real_generic_simple" = x"yes" ; then - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_sse_block2=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi +$as_echo "#define WITH_REAL_GENERIC_SIMPLE_KERNEL 1" >>confdefs.h - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "complex-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED=$kernel_defined - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-sse-block2-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic_simple_block4" = x"yes"; then + WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE= + WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_FALSE='#' +else + WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE='#' + WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_FALSE= +fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-sse-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_generic_simple_block4" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-sse-block2-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-sse-block2-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-sse-block2-kernel-only failed; A specific kernel for complex case has already been defined before! 
-See \`config.log' for more details" "$LINENO" 5; } - fi +$as_echo "#define WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 fi + ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED=$kernel_defined + if test x"$use_real_generic_simple_block6" = x"yes"; then + WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE= + WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_FALSE='#' +else + WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE='#' + WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_FALSE= +fi + if test x"$use_real_generic_simple_block6" = x"yes" ; then -# Check whether --with-complex-avx-block1-kernel-only was given. -if test "${with_complex_avx_block1_kernel_only+set}" = set; then : - withval=$with_complex_avx_block1_kernel_only; with_option=yes -else - with_option=no -fi +$as_echo "#define WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED=$kernel_defined - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_avx_block1=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "complex-avx-block1-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - fi - fi + if test x"$use_complex_generic" = x"yes"; then + WITH_COMPLEX_GENERIC_KERNEL_TRUE= + WITH_COMPLEX_GENERIC_KERNEL_FALSE='#' +else + WITH_COMPLEX_GENERIC_KERNEL_TRUE='#' + WITH_COMPLEX_GENERIC_KERNEL_FALSE= +fi - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-avx-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi + if test x"$use_complex_generic" = x"yes" ; then - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-avx-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-avx-block1-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi +$as_echo "#define WITH_COMPLEX_GENERIC_KERNEL 1" >>confdefs.h - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-avx-block1-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_avx=yes - fi - fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_GENERIC_COMPILED=$kernel_defined + + + if test x"$use_complex_generic_simple" = x"yes"; then + WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE= + WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE='#' +else + WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE='#' + WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE= +fi - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-avx-block1-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-avx-block1-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-avx-block1-kernel-only failed; A specific kernel for complex case has already been defined before! -See \`config.log' for more details" "$LINENO" 5; } - fi - fi + if test x"$use_complex_generic_simple" = x"yes" ; then +$as_echo "#define WITH_COMPLEX_GENERIC_SIMPLE_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED=$kernel_defined -# Check whether --with-complex-avx-block2-kernel-only was given. 
-if test "${with_complex_avx_block2_kernel_only+set}" = set; then : - withval=$with_complex_avx_block2_kernel_only; with_option=yes -else - with_option=no + if test x"$use_real_sparc64_block2" = x"yes"; then + WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE= + WITH_REAL_SPARC64_BLOCK2_KERNEL_FALSE='#' +else + WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE='#' + WITH_REAL_SPARC64_BLOCK2_KERNEL_FALSE= fi + if test x"$use_real_sparc64_block2" = x"yes" ; then - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - install_complex_avx_block2=yes - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi +$as_echo "#define WITH_REAL_SPARC64_BLOCK2_KERNEL 1" >>confdefs.h - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - as_fn_error $? "complex-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - fi - fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED=$kernel_defined - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - as_fn_error $? "complex-avx-block2-kernel kernel was set, but cannot be compiled!" 
"$LINENO" 5 - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_sparc64_block4" = x"yes"; then + WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE= + WITH_REAL_SPARC64_BLOCK4_KERNEL_FALSE='#' +else + WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE='#' + WITH_REAL_SPARC64_BLOCK4_KERNEL_FALSE= +fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - as_fn_error $? "complex-avx-block2-kernel kernel was set, but cannot be compiled!" "$LINENO" 5 - else - want_avx=yes - fi - fi + if test x"$use_real_sparc64_block4" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: complex-avx-block2-kernel-only will be the only compiled kernel for complex case" >&5 -$as_echo "$as_me: complex-avx-block2-kernel-only will be the only compiled kernel for complex case" >&6;} -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "complex-avx-block2-kernel-only failed; A specific kernel for complex case has already been defined before! 
-See \`config.log' for more details" "$LINENO" 5; } - fi +$as_echo "#define WITH_REAL_SPARC64_BLOCK4_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 fi + ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED=$kernel_defined + if test x"$use_real_sparc64_block6" = x"yes"; then + WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE= + WITH_REAL_SPARC64_BLOCK6_KERNEL_FALSE='#' +else + WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE='#' + WITH_REAL_SPARC64_BLOCK6_KERNEL_FALSE= +fi -if test x"${can_use_iso_fortran_env}" = x"yes" ; then + if test x"$use_real_sparc64_block6" = x"yes" ; then -$as_echo "#define HAVE_ISO_FORTRAN_ENV 1" >>confdefs.h +$as_echo "#define WITH_REAL_SPARC64_BLOCK6_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED=$kernel_defined - if test x"$install_real_generic" = x"yes"; then - WITH_REAL_GENERIC_KERNEL_TRUE= - WITH_REAL_GENERIC_KERNEL_FALSE='#' + + if test x"$use_real_neon_arch64_block2" = x"yes"; then + WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE= + WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_FALSE='#' else - WITH_REAL_GENERIC_KERNEL_TRUE='#' - WITH_REAL_GENERIC_KERNEL_FALSE= + WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE='#' + WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_FALSE= fi -if test x"${install_real_generic}" = x"yes" ; then + if test x"$use_real_neon_arch64_block2" = x"yes" ; then -$as_echo "#define WITH_REAL_GENERIC_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED=$kernel_defined - if test x"$install_complex_generic" = x"yes"; then - WITH_COMPLEX_GENERIC_KERNEL_TRUE= - WITH_COMPLEX_GENERIC_KERNEL_FALSE='#' + + if test x"$use_real_neon_arch64_block4" = x"yes"; then + WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE= + WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_FALSE='#' else - WITH_COMPLEX_GENERIC_KERNEL_TRUE='#' - WITH_COMPLEX_GENERIC_KERNEL_FALSE= + 
WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE='#' + WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_FALSE= fi -if test x"${install_complex_generic}" = x"yes" ; then + if test x"$use_real_neon_arch64_block4" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_GENERIC_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED=$kernel_defined - if test x"$install_real_generic_simple" = x"yes"; then - WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE= - WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE='#' + + if test x"$use_real_neon_arch64_block6" = x"yes"; then + WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE= + WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_FALSE='#' else - WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE='#' - WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE= + WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE='#' + WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_FALSE= fi -if test x"${install_real_generic_simple}" = x"yes" ; then + if test x"$use_real_neon_arch64_block6" = x"yes" ; then -$as_echo "#define WITH_REAL_GENERIC_SIMPLE_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED=$kernel_defined - if test x"$install_complex_generic_simple" = x"yes"; then - WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE= - WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE='#' + + if test x"$use_real_vsx_block2" = x"yes"; then + WITH_REAL_VSX_BLOCK2_KERNEL_TRUE= + WITH_REAL_VSX_BLOCK2_KERNEL_FALSE='#' else - WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE='#' - WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE= + WITH_REAL_VSX_BLOCK2_KERNEL_TRUE='#' + WITH_REAL_VSX_BLOCK2_KERNEL_FALSE= fi -if test x"${install_complex_generic_simple}" = x"yes" ; then + if test x"$use_real_vsx_block2" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_GENERIC_SIMPLE_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_VSX_BLOCK2_KERNEL 1" 
>>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED=$kernel_defined - if test x"$install_real_sse_assembly" = x"yes"; then - WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE= - WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE='#' + + if test x"$use_real_vsx_block4" = x"yes"; then + WITH_REAL_VSX_BLOCK4_KERNEL_TRUE= + WITH_REAL_VSX_BLOCK4_KERNEL_FALSE='#' else - WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE='#' - WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE= + WITH_REAL_VSX_BLOCK4_KERNEL_TRUE='#' + WITH_REAL_VSX_BLOCK4_KERNEL_FALSE= fi -if test x"${install_real_sse_assembly}" = x"yes" ; then + if test x"$use_real_vsx_block4" = x"yes" ; then -$as_echo "#define WITH_REAL_SSE_ASSEMBLY_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_VSX_BLOCK4_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED=$kernel_defined - if test x"$install_complex_sse_assembly" = x"yes"; then - WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE= - WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE='#' + + if test x"$use_real_vsx_block6" = x"yes"; then + WITH_REAL_VSX_BLOCK6_KERNEL_TRUE= + WITH_REAL_VSX_BLOCK6_KERNEL_FALSE='#' else - WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE='#' - WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE= + WITH_REAL_VSX_BLOCK6_KERNEL_TRUE='#' + WITH_REAL_VSX_BLOCK6_KERNEL_FALSE= fi -if test x"${install_complex_sse_assembly}" = x"yes" ; then + if test x"$use_real_vsx_block6" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_SSE_ASSEMBLY_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_VSX_BLOCK6_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED=$kernel_defined - if test x"$install_real_sse_block2" = x"yes"; then + + if test x"$use_real_sse_block2" = x"yes"; then WITH_REAL_SSE_BLOCK2_KERNEL_TRUE= WITH_REAL_SSE_BLOCK2_KERNEL_FALSE='#' else @@ -9423,13 +11982,18 @@ WITH_REAL_SSE_BLOCK2_KERNEL_FALSE= fi -if test x"${install_real_sse_block2}" = x"yes" ; then + if test 
x"$use_real_sse_block2" = x"yes" ; then $as_echo "#define WITH_REAL_SSE_BLOCK2_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED=$kernel_defined - if test x"$install_real_sse_block4" = x"yes"; then + + if test x"$use_real_sse_block4" = x"yes"; then WITH_REAL_SSE_BLOCK4_KERNEL_TRUE= WITH_REAL_SSE_BLOCK4_KERNEL_FALSE='#' else @@ -9437,13 +12001,18 @@ WITH_REAL_SSE_BLOCK4_KERNEL_FALSE= fi -if test x"${install_real_sse_block4}" = x"yes" ; then + if test x"$use_real_sse_block4" = x"yes" ; then $as_echo "#define WITH_REAL_SSE_BLOCK4_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED=$kernel_defined - if test x"$install_real_sse_block6" = x"yes"; then + + if test x"$use_real_sse_block6" = x"yes"; then WITH_REAL_SSE_BLOCK6_KERNEL_TRUE= WITH_REAL_SSE_BLOCK6_KERNEL_FALSE='#' else @@ -9451,13 +12020,94 @@ WITH_REAL_SSE_BLOCK6_KERNEL_FALSE= fi -if test x"${install_real_sse_block6}" = x"yes" ; then + if test x"$use_real_sse_block6" = x"yes" ; then $as_echo "#define WITH_REAL_SSE_BLOCK6_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED=$kernel_defined + + + if test x"$use_complex_sse_block1" = x"yes"; then + WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE= + WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE='#' +else + WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE='#' + WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE= +fi + + if test x"$use_complex_sse_block1" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_SSE_BLOCK1_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED=$kernel_defined + + + if test x"$use_complex_sse_block2" = x"yes"; then + WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE= + WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE='#' +else + WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE='#' + WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE= +fi + + if test x"$use_complex_sse_block2" = x"yes" ; then + 
+$as_echo "#define WITH_COMPLEX_SSE_BLOCK2_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED=$kernel_defined + + + if test x"$use_real_sse_assembly" = x"yes"; then + WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE= + WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE='#' +else + WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE='#' + WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE= +fi + + if test x"$use_real_sse_assembly" = x"yes" ; then + +$as_echo "#define WITH_REAL_SSE_ASSEMBLY_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED=$kernel_defined + + + if test x"$use_complex_sse_assembly" = x"yes"; then + WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE= + WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE='#' +else + WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE='#' + WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE= fi - if test x"$install_real_avx_block2" = x"yes"; then + if test x"$use_complex_sse_assembly" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_SSE_ASSEMBLY_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED=$kernel_defined + + + if test x"$use_real_avx_block2" = x"yes"; then WITH_REAL_AVX_BLOCK2_KERNEL_TRUE= WITH_REAL_AVX_BLOCK2_KERNEL_FALSE='#' else @@ -9465,13 +12115,18 @@ WITH_REAL_AVX_BLOCK2_KERNEL_FALSE= fi -if test x"${install_real_avx_block2}" = x"yes" ; then + if test x"$use_real_avx_block2" = x"yes" ; then $as_echo "#define WITH_REAL_AVX_BLOCK2_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED=$kernel_defined - if test x"$install_real_avx_block4" = x"yes"; then + + if test x"$use_real_avx_block4" = x"yes"; then WITH_REAL_AVX_BLOCK4_KERNEL_TRUE= WITH_REAL_AVX_BLOCK4_KERNEL_FALSE='#' else @@ -9479,13 +12134,18 @@ WITH_REAL_AVX_BLOCK4_KERNEL_FALSE= fi -if test x"${install_real_avx_block4}" = x"yes" ; then + if test x"$use_real_avx_block4" = x"yes" ; then $as_echo "#define 
WITH_REAL_AVX_BLOCK4_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED=$kernel_defined - if test x"$install_real_avx_block6" = x"yes"; then + + if test x"$use_real_avx_block6" = x"yes"; then WITH_REAL_AVX_BLOCK6_KERNEL_TRUE= WITH_REAL_AVX_BLOCK6_KERNEL_FALSE='#' else @@ -9493,13 +12153,56 @@ WITH_REAL_AVX_BLOCK6_KERNEL_FALSE= fi -if test x"${install_real_avx_block6}" = x"yes" ; then + if test x"$use_real_avx_block6" = x"yes" ; then $as_echo "#define WITH_REAL_AVX_BLOCK6_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED=$kernel_defined + + + if test x"$use_complex_avx_block1" = x"yes"; then + WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE= + WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE='#' +else + WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE='#' + WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE= +fi + + if test x"$use_complex_avx_block1" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_AVX_BLOCK1_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED=$kernel_defined + + + if test x"$use_complex_avx_block2" = x"yes"; then + WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE= + WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE='#' +else + WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE='#' + WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE= fi - if test x"$install_real_avx2_block2" = x"yes"; then + if test x"$use_complex_avx_block2" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_AVX_BLOCK2_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED=$kernel_defined + + + if test x"$use_real_avx2_block2" = x"yes"; then WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE= WITH_REAL_AVX2_BLOCK2_KERNEL_FALSE='#' else @@ -9507,13 +12210,18 @@ WITH_REAL_AVX2_BLOCK2_KERNEL_FALSE= fi -if test x"${install_real_avx2_block2}" = x"yes" ; then + if test x"$use_real_avx2_block2" = x"yes" ; then $as_echo "#define WITH_REAL_AVX2_BLOCK2_KERNEL 1" 
>>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED=$kernel_defined - if test x"$install_real_avx2_block4" = x"yes"; then + + if test x"$use_real_avx2_block4" = x"yes"; then WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE= WITH_REAL_AVX2_BLOCK4_KERNEL_FALSE='#' else @@ -9521,13 +12229,18 @@ WITH_REAL_AVX2_BLOCK4_KERNEL_FALSE= fi -if test x"${install_real_avx2_block4}" = x"yes" ; then + if test x"$use_real_avx2_block4" = x"yes" ; then $as_echo "#define WITH_REAL_AVX2_BLOCK4_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED=$kernel_defined - if test x"$install_real_avx2_block6" = x"yes"; then + + if test x"$use_real_avx2_block6" = x"yes"; then WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE= WITH_REAL_AVX2_BLOCK6_KERNEL_FALSE='#' else @@ -9535,97 +12248,151 @@ WITH_REAL_AVX2_BLOCK6_KERNEL_FALSE= fi -if test x"${install_real_avx2_block6}" = x"yes" ; then + if test x"$use_real_avx2_block6" = x"yes" ; then $as_echo "#define WITH_REAL_AVX2_BLOCK6_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED=$kernel_defined - if test x"$install_complex_sse_block1" = x"yes"; then - WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE= - WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE='#' + + if test x"$use_complex_avx2_block1" = x"yes"; then + WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE= + WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE='#' else - WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE='#' - WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE= + WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE='#' + WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE= fi -if test x"${install_complex_sse_block1}" = x"yes" ; then + if test x"$use_complex_avx2_block1" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_SSE_BLOCK1_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_COMPLEX_AVX2_BLOCK1_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED=$kernel_defined - if 
test x"$install_complex_sse_block2" = x"yes"; then - WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE= - WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE='#' + + if test x"$use_complex_avx2_block2" = x"yes"; then + WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE= + WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE='#' else - WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE='#' - WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE= + WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE='#' + WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE= fi -if test x"${install_complex_sse_block2}" = x"yes" ; then + if test x"$use_complex_avx2_block2" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_SSE_BLOCK2_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_COMPLEX_AVX2_BLOCK2_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED=$kernel_defined + + if test x"$use_real_avx512_block2" = x"yes"; then + WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE= + WITH_REAL_AVX512_BLOCK2_KERNEL_FALSE='#' +else + WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE='#' + WITH_REAL_AVX512_BLOCK2_KERNEL_FALSE= fi - if test x"$install_complex_avx_block1" = x"yes"; then - WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE= - WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE='#' + if test x"$use_real_avx512_block2" = x"yes" ; then + +$as_echo "#define WITH_REAL_AVX512_BLOCK2_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED=$kernel_defined + + + if test x"$use_real_avx512_block4" = x"yes"; then + WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE= + WITH_REAL_AVX512_BLOCK4_KERNEL_FALSE='#' else - WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE='#' - WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE= + WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE='#' + WITH_REAL_AVX512_BLOCK4_KERNEL_FALSE= fi -if test x"${install_complex_avx_block1}" = x"yes" ; then + if test x"$use_real_avx512_block4" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_AVX_BLOCK1_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_AVX512_BLOCK4_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + 
fi + ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED=$kernel_defined -fi - if test x"$install_complex_avx_block2" = x"yes"; then - WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE= - WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE='#' + if test x"$use_real_avx512_block6" = x"yes"; then + WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE= + WITH_REAL_AVX512_BLOCK6_KERNEL_FALSE='#' else - WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE='#' - WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE= + WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE='#' + WITH_REAL_AVX512_BLOCK6_KERNEL_FALSE= fi -if test x"${install_complex_avx_block2}" = x"yes" ; then + if test x"$use_real_avx512_block6" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_AVX_BLOCK2_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_REAL_AVX512_BLOCK6_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED=$kernel_defined - if test x"$install_complex_avx2_block1" = x"yes"; then - WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE= - WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE='#' + + if test x"$use_complex_avx512_block1" = x"yes"; then + WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE= + WITH_COMPLEX_AVX512_BLOCK1_KERNEL_FALSE='#' else - WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE='#' - WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE= + WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE='#' + WITH_COMPLEX_AVX512_BLOCK1_KERNEL_FALSE= fi -if test x"${install_complex_avx2_block1}" = x"yes" ; then + if test x"$use_complex_avx512_block1" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_AVX2_BLOCK1_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_COMPLEX_AVX512_BLOCK1_KERNEL 1" >>confdefs.h -fi + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED=$kernel_defined - if test x"$install_complex_avx2_block2" = x"yes"; then - WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE= - WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE='#' + + if test x"$use_complex_avx512_block2" = x"yes"; then + WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE= + WITH_COMPLEX_AVX512_BLOCK2_KERNEL_FALSE='#' else - 
WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE='#' - WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE= + WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE='#' + WITH_COMPLEX_AVX512_BLOCK2_KERNEL_FALSE= fi -if test x"${install_complex_avx2_block2}" = x"yes" ; then + if test x"$use_complex_avx512_block2" = x"yes" ; then -$as_echo "#define WITH_COMPLEX_AVX2_BLOCK2_KERNEL 1" >>confdefs.h +$as_echo "#define WITH_COMPLEX_AVX512_BLOCK2_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED=$kernel_defined -fi - if test x"$install_real_bgp" = x"yes"; then + if test x"$use_real_bgp" = x"yes"; then WITH_REAL_BGP_KERNEL_TRUE= WITH_REAL_BGP_KERNEL_FALSE='#' else @@ -9633,13 +12400,37 @@ WITH_REAL_BGP_KERNEL_FALSE= fi -if test x"${install_real_bgp}" = x"yes" ; then + if test x"$use_real_bgp" = x"yes" ; then $as_echo "#define WITH_REAL_BGP_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_BGP_COMPILED=$kernel_defined + + + if test x"$use_complex_bgp" = x"yes"; then + WITH_COMPLEX_BGP_KERNEL_TRUE= + WITH_COMPLEX_BGP_KERNEL_FALSE='#' +else + WITH_COMPLEX_BGP_KERNEL_TRUE='#' + WITH_COMPLEX_BGP_KERNEL_FALSE= fi - if test x"$install_real_bgq" = x"yes"; then + if test x"$use_complex_bgp" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_BGP_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_BGP_COMPILED=$kernel_defined + + + if test x"$use_real_bgq" = x"yes"; then WITH_REAL_BGQ_KERNEL_TRUE= WITH_REAL_BGQ_KERNEL_FALSE='#' else @@ -9647,32 +12438,155 @@ WITH_REAL_BGQ_KERNEL_FALSE= fi -if test x"${install_real_bgq}" = x"yes" ; then + if test x"$use_real_bgq" = x"yes" ; then $as_echo "#define WITH_REAL_BGQ_KERNEL 1" >>confdefs.h + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_REAL_BGQ_COMPILED=$kernel_defined + + + if test x"$use_complex_bgq" = x"yes"; then + WITH_COMPLEX_BGQ_KERNEL_TRUE= + WITH_COMPLEX_BGQ_KERNEL_FALSE='#' +else + 
WITH_COMPLEX_BGQ_KERNEL_TRUE='#' + WITH_COMPLEX_BGQ_KERNEL_FALSE= fi -if test x"${use_specific_complex_kernel}" = x"no" ; then + if test x"$use_complex_bgq" = x"yes" ; then + +$as_echo "#define WITH_COMPLEX_BGQ_KERNEL 1" >>confdefs.h + + kernel_defined=1 + else + kernel_defined=0 + fi + ELPA_2STAGE_COMPLEX_BGQ_COMPILED=$kernel_defined -$as_echo "#define WITH_NO_SPECIFIC_COMPLEX_KERNEL 1" >>confdefs.h + + if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"; then + WITH_GPU_VERSION_TRUE= + WITH_GPU_VERSION_FALSE='#' else + WITH_GPU_VERSION_TRUE='#' + WITH_GPU_VERSION_FALSE= +fi -$as_echo "#define WITH_ONE_SPECIFIC_COMPLEX_KERNEL 1" >>confdefs.h +if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then +$as_echo "#define WITH_GPU_VERSION 1" >>confdefs.h + + #AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build]) + ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 + ELPA_2STAGE_REAL_GPU_COMPILED=1 + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether --enable-nvtx is specified" >&5 +$as_echo_n "checking whether --enable-nvtx is specified... " >&6; } + # Check whether --enable-nvtx was given. +if test "${enable_nvtx+set}" = set; then : + enableval=$enable_nvtx; + if test x"$enableval" = x"yes"; then + enable_nvtx=yes + else + enable_nvtx=no + fi + +else + enable_nvtx=no fi -if test x"${use_specific_real_kernel}" = x"no" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_nvtx}" >&5 +$as_echo "${enable_nvtx}" >&6; } + if test x"${enable_nvtx}" = x"yes"; then + +$as_echo "#define WITH_NVTX 1" >>confdefs.h + + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing nvtxRangePop" >&5 +$as_echo_n "checking for library containing nvtxRangePop... 
" >&6; } +if ${ac_cv_search_nvtxRangePop+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ -$as_echo "#define WITH_NO_SPECIFIC_REAL_KERNEL 1" >>confdefs.h +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char nvtxRangePop (); +int +main () +{ +return nvtxRangePop (); + ; + return 0; +} +_ACEOF +for ac_lib in '' nvToolsExt; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_c_try_link "$LINENO"; then : + ac_cv_search_nvtxRangePop=$ac_res +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if ${ac_cv_search_nvtxRangePop+:} false; then : + break +fi +done +if ${ac_cv_search_nvtxRangePop+:} false; then : else + ac_cv_search_nvtxRangePop=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_nvtxRangePop" >&5 +$as_echo "$ac_cv_search_nvtxRangePop" >&6; } +ac_res=$ac_cv_search_nvtxRangePop +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + have_nvtoolsext=yes +else + have_nvtoolsext=no +fi -$as_echo "#define WITH_ONE_SPECIFIC_REAL_KERNEL 1" >>confdefs.h + if test x"${have_nvtoolsext}" = x"no"; then + as_fn_error $? 
"Could not link nvToolsExt; try to set the cuda-path or disable GPU support " "$LINENO" 5 + fi + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi +else + ELPA_2STAGE_COMPLEX_GPU_COMPILED=0 + ELPA_2STAGE_REAL_GPU_COMPILED=0 fi + + case `pwd` in *\ * | *\ *) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Libtool does not cope well with whitespace in \`pwd\`" >&5 @@ -9778,165 +12692,23 @@ done echo "$ac_script" 2>/dev/null | sed 99q >conftest.sed { ac_script=; unset ac_script;} - if test -z "$SED"; then - ac_path_SED_found=false - # Loop through the user's path and test for each of PROGNAME-LIST - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in sed gsed; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_SED="$as_dir/$ac_prog$ac_exec_ext" - as_fn_executable_p "$ac_path_SED" || continue -# Check for GNU ac_path_SED and select it if it is found. 
- # Check for GNU $ac_path_SED -case `"$ac_path_SED" --version 2>&1` in -*GNU*) - ac_cv_path_SED="$ac_path_SED" ac_path_SED_found=:;; -*) - ac_count=0 - $as_echo_n 0123456789 >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - $as_echo '' >> "conftest.nl" - "$ac_path_SED" -f conftest.sed < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_SED_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_SED="$ac_path_SED" - ac_path_SED_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - $ac_path_SED_found && break 3 - done - done - done -IFS=$as_save_IFS - if test -z "$ac_cv_path_SED"; then - as_fn_error $? "no acceptable sed could be found in \$PATH" "$LINENO" 5 - fi -else - ac_cv_path_SED=$SED -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_SED" >&5 -$as_echo "$ac_cv_path_SED" >&6; } - SED="$ac_cv_path_SED" - rm -f conftest.sed - -test -z "$SED" && SED=sed -Xsed="$SED -e 1s/^X//" - - - - - - - - - - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 -$as_echo_n "checking for grep that handles long lines and -e... " >&6; } -if ${ac_cv_path_GREP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -z "$GREP"; then - ac_path_GREP_found=false - # Loop through the user's path and test for each of PROGNAME-LIST - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_prog in grep ggrep; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" - as_fn_executable_p "$ac_path_GREP" || continue -# Check for GNU ac_path_GREP and select it if it is found. - # Check for GNU $ac_path_GREP -case `"$ac_path_GREP" --version 2>&1` in -*GNU*) - ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; -*) - ac_count=0 - $as_echo_n 0123456789 >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - $as_echo 'GREP' >> "conftest.nl" - "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_GREP_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_GREP="$ac_path_GREP" - ac_path_GREP_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - $ac_path_GREP_found && break 3 - done - done - done -IFS=$as_save_IFS - if test -z "$ac_cv_path_GREP"; then - as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 - fi -else - ac_cv_path_GREP=$GREP -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 -$as_echo "$ac_cv_path_GREP" >&6; } - GREP="$ac_cv_path_GREP" - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 -$as_echo_n "checking for egrep... 
" >&6; } -if ${ac_cv_path_EGREP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 - then ac_cv_path_EGREP="$GREP -E" - else - if test -z "$EGREP"; then - ac_path_EGREP_found=false + if test -z "$SED"; then + ac_path_SED_found=false # Loop through the user's path and test for each of PROGNAME-LIST as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. - for ac_prog in egrep; do + for ac_prog in sed gsed; do for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" - as_fn_executable_p "$ac_path_EGREP" || continue -# Check for GNU ac_path_EGREP and select it if it is found. - # Check for GNU $ac_path_EGREP -case `"$ac_path_EGREP" --version 2>&1` in + ac_path_SED="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_SED" || continue +# Check for GNU ac_path_SED and select it if it is found. + # Check for GNU $ac_path_SED +case `"$ac_path_SED" --version 2>&1` in *GNU*) - ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; + ac_cv_path_SED="$ac_path_SED" ac_path_SED_found=:;; *) ac_count=0 $as_echo_n 0123456789 >"conftest.in" @@ -9945,14 +12717,14 @@ cat "conftest.in" "conftest.in" >"conftest.tmp" mv "conftest.tmp" "conftest.in" cp "conftest.in" "conftest.nl" - $as_echo 'EGREP' >> "conftest.nl" - "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + $as_echo '' >> "conftest.nl" + "$ac_path_SED" -f conftest.sed < "conftest.nl" >"conftest.out" 2>/dev/null || break diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_EGREP_max-0}; then + if test $ac_count -gt ${ac_path_SED_max-0}; then # Best one so far, save it but keep looking for a better one - ac_cv_path_EGREP="$ac_path_EGREP" - ac_path_EGREP_max=$ac_count + ac_cv_path_SED="$ac_path_SED" + 
ac_path_SED_max=$ac_count fi # 10*(2^10) chars as input seems more than enough test $ac_count -gt 10 && break @@ -9960,23 +12732,35 @@ rm -f conftest.in conftest.tmp conftest.nl conftest.out;; esac - $ac_path_EGREP_found && break 3 + $ac_path_SED_found && break 3 done done done IFS=$as_save_IFS - if test -z "$ac_cv_path_EGREP"; then - as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + if test -z "$ac_cv_path_SED"; then + as_fn_error $? "no acceptable sed could be found in \$PATH" "$LINENO" 5 fi else - ac_cv_path_EGREP=$EGREP + ac_cv_path_SED=$SED fi - fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 -$as_echo "$ac_cv_path_EGREP" >&6; } - EGREP="$ac_cv_path_EGREP" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_SED" >&5 +$as_echo "$ac_cv_path_SED" >&6; } + SED="$ac_cv_path_SED" + rm -f conftest.sed + +test -z "$SED" && SED=sed +Xsed="$SED -e 1s/^X//" + + + + + + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for fgrep" >&5 @@ -11291,12 +14075,18 @@ $as_echo_n "(cached) " >&6 else lt_cv_ar_at_file=no - cat > conftest.$ac_ext <<_ACEOF - program main + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ - end +int +main () +{ + + ; + return 0; +} _ACEOF -if ac_fn_fc_try_compile "$LINENO"; then : +if ac_fn_c_try_compile "$LINENO"; then : echo conftest.$ac_objext > conftest.lst lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&5' { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$lt_ar_try\""; } >&5 @@ -12565,193 +15355,9 @@ if ${ac_cv_prog_ac_ct_NMEDIT+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ac_ct_NMEDIT"; then - ac_cv_prog_ac_ct_NMEDIT="$ac_ct_NMEDIT" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_NMEDIT="nmedit" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_NMEDIT=$ac_cv_prog_ac_ct_NMEDIT -if test -n "$ac_ct_NMEDIT"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_NMEDIT" >&5 -$as_echo "$ac_ct_NMEDIT" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_NMEDIT" = x; then - NMEDIT=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - NMEDIT=$ac_ct_NMEDIT - fi -else - NMEDIT="$ac_cv_prog_NMEDIT" -fi - - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}lipo", so it can be a program name with args. -set dummy ${ac_tool_prefix}lipo; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_LIPO+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$LIPO"; then - ac_cv_prog_LIPO="$LIPO" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_LIPO="${ac_tool_prefix}lipo" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -LIPO=$ac_cv_prog_LIPO -if test -n "$LIPO"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIPO" >&5 -$as_echo "$LIPO" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_LIPO"; then - ac_ct_LIPO=$LIPO - # Extract the first word of "lipo", so it can be a program name with args. -set dummy lipo; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_LIPO+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_LIPO"; then - ac_cv_prog_ac_ct_LIPO="$ac_ct_LIPO" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_LIPO="lipo" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_LIPO=$ac_cv_prog_ac_ct_LIPO -if test -n "$ac_ct_LIPO"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_LIPO" >&5 -$as_echo "$ac_ct_LIPO" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_LIPO" = x; then - LIPO=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - LIPO=$ac_ct_LIPO - fi -else - LIPO="$ac_cv_prog_LIPO" -fi - - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}otool", so it can be a program name with args. -set dummy ${ac_tool_prefix}otool; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_OTOOL+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$OTOOL"; then - ac_cv_prog_OTOOL="$OTOOL" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_OTOOL="${ac_tool_prefix}otool" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -OTOOL=$ac_cv_prog_OTOOL -if test -n "$OTOOL"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL" >&5 -$as_echo "$OTOOL" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_OTOOL"; then - ac_ct_OTOOL=$OTOOL - # Extract the first word of "otool", so it can be a program name with args. -set dummy otool; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_OTOOL+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_OTOOL"; then - ac_cv_prog_ac_ct_OTOOL="$ac_ct_OTOOL" # Let the user override the test. -else + if test -n "$ac_ct_NMEDIT"; then + ac_cv_prog_ac_ct_NMEDIT="$ac_ct_NMEDIT" # Let the user override the test. +else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do @@ -12759,7 +15365,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_OTOOL="otool" + ac_cv_prog_ac_ct_NMEDIT="nmedit" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -12769,17 +15375,17 @@ fi fi -ac_ct_OTOOL=$ac_cv_prog_ac_ct_OTOOL -if test -n "$ac_ct_OTOOL"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL" >&5 -$as_echo "$ac_ct_OTOOL" >&6; } +ac_ct_NMEDIT=$ac_cv_prog_ac_ct_NMEDIT +if test -n "$ac_ct_NMEDIT"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_NMEDIT" >&5 +$as_echo "$ac_ct_NMEDIT" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_ct_OTOOL" = x; then - OTOOL=":" + if test "x$ac_ct_NMEDIT" = x; then + NMEDIT=":" else case $cross_compiling:$ac_tool_warned in yes:) @@ -12787,22 +15393,22 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - OTOOL=$ac_ct_OTOOL + NMEDIT=$ac_ct_NMEDIT fi else - OTOOL="$ac_cv_prog_OTOOL" + NMEDIT="$ac_cv_prog_NMEDIT" fi if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}otool64", so it can be a program name with args. -set dummy ${ac_tool_prefix}otool64; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}lipo", so it can be a program name with args. +set dummy ${ac_tool_prefix}lipo; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_OTOOL64+:} false; then : +if ${ac_cv_prog_LIPO+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$OTOOL64"; then - ac_cv_prog_OTOOL64="$OTOOL64" # Let the user override the test. + if test -n "$LIPO"; then + ac_cv_prog_LIPO="$LIPO" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH @@ -12811,7 +15417,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_OTOOL64="${ac_tool_prefix}otool64" + ac_cv_prog_LIPO="${ac_tool_prefix}lipo" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -12821,10 +15427,10 @@ fi fi -OTOOL64=$ac_cv_prog_OTOOL64 -if test -n "$OTOOL64"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL64" >&5 -$as_echo "$OTOOL64" >&6; } +LIPO=$ac_cv_prog_LIPO +if test -n "$LIPO"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIPO" >&5 +$as_echo "$LIPO" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -12832,17 +15438,17 @@ fi -if test -z "$ac_cv_prog_OTOOL64"; then - ac_ct_OTOOL64=$OTOOL64 - # Extract the first word of "otool64", so it can be a program name with args. -set dummy otool64; ac_word=$2 +if test -z "$ac_cv_prog_LIPO"; then + ac_ct_LIPO=$LIPO + # Extract the first word of "lipo", so it can be a program name with args. +set dummy lipo; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_OTOOL64+:} false; then : +if ${ac_cv_prog_ac_ct_LIPO+:} false; then : $as_echo_n "(cached) " >&6 else - if test -n "$ac_ct_OTOOL64"; then - ac_cv_prog_ac_ct_OTOOL64="$ac_ct_OTOOL64" # Let the user override the test. + if test -n "$ac_ct_LIPO"; then + ac_cv_prog_ac_ct_LIPO="$ac_ct_LIPO" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH @@ -12851,7 +15457,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_ac_ct_OTOOL64="otool64" + ac_cv_prog_ac_ct_LIPO="lipo" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -12861,17 +15467,17 @@ fi fi -ac_ct_OTOOL64=$ac_cv_prog_ac_ct_OTOOL64 -if test -n "$ac_ct_OTOOL64"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL64" >&5 -$as_echo "$ac_ct_OTOOL64" >&6; } +ac_ct_LIPO=$ac_cv_prog_ac_ct_LIPO +if test -n "$ac_ct_LIPO"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_LIPO" >&5 +$as_echo "$ac_ct_LIPO" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_ct_OTOOL64" = x; then - OTOOL64=":" + if test "x$ac_ct_LIPO" = x; then + LIPO=":" else case $cross_compiling:$ac_tool_warned in yes:) @@ -12879,477 +15485,394 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - OTOOL64=$ac_ct_OTOOL64 + LIPO=$ac_ct_LIPO fi else - OTOOL64="$ac_cv_prog_OTOOL64" + LIPO="$ac_cv_prog_LIPO" fi - - - - - - - - - - - - - - - - - - - - - - - - - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -single_module linker flag" >&5 -$as_echo_n "checking for -single_module linker flag... " >&6; } -if ${lt_cv_apple_cc_single_mod+:} false; then : + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}otool", so it can be a program name with args. +set dummy ${ac_tool_prefix}otool; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_OTOOL+:} false; then : $as_echo_n "(cached) " >&6 else - lt_cv_apple_cc_single_mod=no - if test -z "$LT_MULTI_MODULE"; then - # By default we will add the -single_module flag. 
You can override - # by either setting the environment variable LT_MULTI_MODULE - # non-empty at configure time, or by adding -multi_module to the - # link flags. - rm -rf libconftest.dylib* - echo "int foo(void){return 1;}" > conftest.c - echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \ --dynamiclib -Wl,-single_module conftest.c" >&5 - $LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \ - -dynamiclib -Wl,-single_module conftest.c 2>conftest.err - _lt_result=$? - # If there is a non-empty error log, and "single_module" - # appears in it, assume the flag caused a linker warning - if test -s conftest.err && $GREP single_module conftest.err; then - cat conftest.err >&5 - # Otherwise, if the output was created with a 0 exit code from - # the compiler, it worked. - elif test -f libconftest.dylib && test 0 = "$_lt_result"; then - lt_cv_apple_cc_single_mod=yes - else - cat conftest.err >&5 - fi - rm -rf libconftest.dylib* - rm -f conftest.* - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_apple_cc_single_mod" >&5 -$as_echo "$lt_cv_apple_cc_single_mod" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -exported_symbols_list linker flag" >&5 -$as_echo_n "checking for -exported_symbols_list linker flag... " >&6; } -if ${lt_cv_ld_exported_symbols_list+:} false; then : - $as_echo_n "(cached) " >&6 + if test -n "$OTOOL"; then + ac_cv_prog_OTOOL="$OTOOL" # Let the user override the test. else - lt_cv_ld_exported_symbols_list=no - save_LDFLAGS=$LDFLAGS - echo "_main" > conftest.sym - LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_OTOOL="${ac_tool_prefix}otool" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - lt_cv_ld_exported_symbols_list=yes -else - lt_cv_ld_exported_symbols_list=no fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - LDFLAGS=$save_LDFLAGS - fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_exported_symbols_list" >&5 -$as_echo "$lt_cv_ld_exported_symbols_list" >&6; } - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -force_load linker flag" >&5 -$as_echo_n "checking for -force_load linker flag... " >&6; } -if ${lt_cv_ld_force_load+:} false; then : - $as_echo_n "(cached) " >&6 +OTOOL=$ac_cv_prog_OTOOL +if test -n "$OTOOL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL" >&5 +$as_echo "$OTOOL" >&6; } else - lt_cv_ld_force_load=no - cat > conftest.c << _LT_EOF -int forced_loaded() { return 2;} -_LT_EOF - echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&5 - $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&5 - echo "$AR cru libconftest.a conftest.o" >&5 - $AR cru libconftest.a conftest.o 2>&5 - echo "$RANLIB libconftest.a" >&5 - $RANLIB libconftest.a 2>&5 - cat > conftest.c << _LT_EOF -int main() { return 0;} -_LT_EOF - echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&5 - $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err - _lt_result=$? 
- if test -s conftest.err && $GREP force_load conftest.err; then - cat conftest.err >&5 - elif test -f conftest && test 0 = "$_lt_result" && $GREP forced_load conftest >/dev/null 2>&1; then - lt_cv_ld_force_load=yes - else - cat conftest.err >&5 - fi - rm -f conftest.err libconftest.a conftest conftest.c - rm -rf conftest.dSYM - + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_force_load" >&5 -$as_echo "$lt_cv_ld_force_load" >&6; } - case $host_os in - rhapsody* | darwin1.[012]) - _lt_dar_allow_undefined='$wl-undefined ${wl}suppress' ;; - darwin1.*) - _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;; - darwin*) # darwin 5.x on - # if running on 10.5 or later, the deployment target defaults - # to the OS version, if on x86, and 10.4, the deployment - # target defaults to 10.4. Don't you love it? - case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in - 10.0,*86*-darwin8*|10.0,*-darwin[91]*) - _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;; - 10.[012][,.]*) - _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;; - 10.*) - _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;; - esac - ;; - esac - if test yes = "$lt_cv_apple_cc_single_mod"; then - _lt_dar_single_mod='$single_module' - fi - if test yes = "$lt_cv_ld_exported_symbols_list"; then - _lt_dar_export_syms=' $wl-exported_symbols_list,$output_objdir/$libname-symbols.expsym' - else - _lt_dar_export_syms='~$NMEDIT -s $output_objdir/$libname-symbols.expsym $lib' - fi - if test : != "$DSYMUTIL" && test no = "$lt_cv_ld_force_load"; then - _lt_dsymutil='~$DSYMUTIL $lib || :' - else - _lt_dsymutil= - fi - ;; - esac -# func_munge_path_list VARIABLE PATH -# ----------------------------------- -# VARIABLE is name of variable containing _space_ separated list of -# directories to be munged by the contents of PATH, which is string -# having a format: -# "DIR[:DIR]:" 
-# string "DIR[ DIR]" will be prepended to VARIABLE -# ":DIR[:DIR]" -# string "DIR[ DIR]" will be appended to VARIABLE -# "DIRP[:DIRP]::[DIRA:]DIRA" -# string "DIRP[ DIRP]" will be prepended to VARIABLE and string -# "DIRA[ DIRA]" will be appended to VARIABLE -# "DIR[:DIR]" -# VARIABLE will be replaced by "DIR[ DIR]" -func_munge_path_list () -{ - case x$2 in - x) - ;; - *:) - eval $1=\"`$ECHO $2 | $SED 's/:/ /g'` \$$1\" - ;; - x:*) - eval $1=\"\$$1 `$ECHO $2 | $SED 's/:/ /g'`\" - ;; - *::*) - eval $1=\"\$$1\ `$ECHO $2 | $SED -e 's/.*:://' -e 's/:/ /g'`\" - eval $1=\"`$ECHO $2 | $SED -e 's/::.*//' -e 's/:/ /g'`\ \$$1\" - ;; - *) - eval $1=\"`$ECHO $2 | $SED 's/:/ /g'`\" - ;; - esac -} -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 -$as_echo_n "checking how to run the C preprocessor... " >&6; } -# On Suns, sometimes $CPP names a directory. -if test -n "$CPP" && test -d "$CPP"; then - CPP= fi -if test -z "$CPP"; then - if ${ac_cv_prog_CPP+:} false; then : +if test -z "$ac_cv_prog_OTOOL"; then + ac_ct_OTOOL=$OTOOL + # Extract the first word of "otool", so it can be a program name with args. +set dummy otool; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_OTOOL+:} false; then : $as_echo_n "(cached) " >&6 else - # Double quotes because CPP needs to be expanded - for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" - do - ac_preproc_ok=false -for ac_c_preproc_warn_flag in '' yes + if test -n "$ac_ct_OTOOL"; then + ac_cv_prog_ac_ct_OTOOL="$ac_ct_OTOOL" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. - # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif - Syntax error -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_OTOOL="otool" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS +fi +fi +ac_ct_OTOOL=$ac_cv_prog_ac_ct_OTOOL +if test -n "$ac_ct_OTOOL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL" >&5 +$as_echo "$ac_ct_OTOOL" >&6; } else - # Broken: fails on valid input. -continue + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -rm -f conftest.err conftest.i conftest.$ac_ext - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - # Broken: success on invalid input. -continue + if test "x$ac_ct_OTOOL" = x; then + OTOOL=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + OTOOL=$ac_ct_OTOOL + fi else - # Passes both tests. 
-ac_preproc_ok=: -break + OTOOL="$ac_cv_prog_OTOOL" fi -rm -f conftest.err conftest.i conftest.$ac_ext + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}otool64", so it can be a program name with args. +set dummy ${ac_tool_prefix}otool64; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_OTOOL64+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$OTOOL64"; then + ac_cv_prog_OTOOL64="$OTOOL64" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_OTOOL64="${ac_tool_prefix}otool64" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. -rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : - break + done +IFS=$as_save_IFS + +fi +fi +OTOOL64=$ac_cv_prog_OTOOL64 +if test -n "$OTOOL64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OTOOL64" >&5 +$as_echo "$OTOOL64" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi - done - ac_cv_prog_CPP=$CPP fi - CPP=$ac_cv_prog_CPP +if test -z "$ac_cv_prog_OTOOL64"; then + ac_ct_OTOOL64=$OTOOL64 + # Extract the first word of "otool64", so it can be a program name with args. +set dummy otool64; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_ac_ct_OTOOL64+:} false; then : + $as_echo_n "(cached) " >&6 else - ac_cv_prog_CPP=$CPP -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 -$as_echo "$CPP" >&6; } -ac_preproc_ok=false -for ac_c_preproc_warn_flag in '' yes + if test -n "$ac_ct_OTOOL64"; then + ac_cv_prog_ac_ct_OTOOL64="$ac_ct_OTOOL64" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. - # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif - Syntax error -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_OTOOL64="otool64" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS +fi +fi +ac_ct_OTOOL64=$ac_cv_prog_ac_ct_OTOOL64 +if test -n "$ac_ct_OTOOL64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_OTOOL64" >&5 +$as_echo "$ac_ct_OTOOL64" >&6; } else - # Broken: fails on valid input. -continue + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -rm -f conftest.err conftest.i conftest.$ac_ext - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - # Broken: success on invalid input. 
-continue + if test "x$ac_ct_OTOOL64" = x; then + OTOOL64=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + OTOOL64=$ac_ct_OTOOL64 + fi else - # Passes both tests. -ac_preproc_ok=: -break + OTOOL64="$ac_cv_prog_OTOOL64" fi -rm -f conftest.err conftest.i conftest.$ac_ext -done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. -rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : -else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "C preprocessor \"$CPP\" fails sanity check -See \`config.log' for more details" "$LINENO" 5; } -fi -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 -$as_echo_n "checking for ANSI C header files... " >&6; } -if ${ac_cv_header_stdc+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include -int -main () -{ - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_header_stdc=yes -else - ac_cv_header_stdc=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then : -else - ac_cv_header_stdc=no -fi -rm -f conftest* -fi -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then : -else - ac_cv_header_stdc=no -fi -rm -f conftest* + + + + + + + + + + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -single_module linker flag" >&5 +$as_echo_n "checking for -single_module linker flag... " >&6; } +if ${lt_cv_apple_cc_single_mod+:} false; then : + $as_echo_n "(cached) " >&6 +else + lt_cv_apple_cc_single_mod=no + if test -z "$LT_MULTI_MODULE"; then + # By default we will add the -single_module flag. You can override + # by either setting the environment variable LT_MULTI_MODULE + # non-empty at configure time, or by adding -multi_module to the + # link flags. + rm -rf libconftest.dylib* + echo "int foo(void){return 1;}" > conftest.c + echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \ +-dynamiclib -Wl,-single_module conftest.c" >&5 + $LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \ + -dynamiclib -Wl,-single_module conftest.c 2>conftest.err + _lt_result=$? + # If there is a non-empty error log, and "single_module" + # appears in it, assume the flag caused a linker warning + if test -s conftest.err && $GREP single_module conftest.err; then + cat conftest.err >&5 + # Otherwise, if the output was created with a 0 exit code from + # the compiler, it worked. 
+ elif test -f libconftest.dylib && test 0 = "$_lt_result"; then + lt_cv_apple_cc_single_mod=yes + else + cat conftest.err >&5 + fi + rm -rf libconftest.dylib* + rm -f conftest.* + fi fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_apple_cc_single_mod" >&5 +$as_echo "$lt_cv_apple_cc_single_mod" >&6; } -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. - if test "$cross_compiling" = yes; then : - : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -exported_symbols_list linker flag" >&5 +$as_echo_n "checking for -exported_symbols_list linker flag... " >&6; } +if ${lt_cv_ld_exported_symbols_list+:} false; then : + $as_echo_n "(cached) " >&6 else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext + lt_cv_ld_exported_symbols_list=no + save_LDFLAGS=$LDFLAGS + echo "_main" > conftest.sym + LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) -#endif -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) int main () { - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; + + ; return 0; } _ACEOF -if ac_fn_c_try_run "$LINENO"; then : - +if ac_fn_c_try_link "$LINENO"; then : + lt_cv_ld_exported_symbols_list=yes else - ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi + lt_cv_ld_exported_symbols_list=no fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 -$as_echo "$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then - -$as_echo "#define STDC_HEADERS 1" >>confdefs.h +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + LDFLAGS=$save_LDFLAGS fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_exported_symbols_list" >&5 +$as_echo "$lt_cv_ld_exported_symbols_list" >&6; } -# On IRIX 5.3, sys/types and inttypes.h are conflicting. -for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ - inttypes.h stdint.h unistd.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default -" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for -force_load linker flag" >&5 +$as_echo_n "checking for -force_load linker flag... 
" >&6; } +if ${lt_cv_ld_force_load+:} false; then : + $as_echo_n "(cached) " >&6 +else + lt_cv_ld_force_load=no + cat > conftest.c << _LT_EOF +int forced_loaded() { return 2;} +_LT_EOF + echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&5 + $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&5 + echo "$AR cru libconftest.a conftest.o" >&5 + $AR cru libconftest.a conftest.o 2>&5 + echo "$RANLIB libconftest.a" >&5 + $RANLIB libconftest.a 2>&5 + cat > conftest.c << _LT_EOF +int main() { return 0;} +_LT_EOF + echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&5 + $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err + _lt_result=$? + if test -s conftest.err && $GREP force_load conftest.err; then + cat conftest.err >&5 + elif test -f conftest && test 0 = "$_lt_result" && $GREP forced_load conftest >/dev/null 2>&1; then + lt_cv_ld_force_load=yes + else + cat conftest.err >&5 + fi + rm -f conftest.err libconftest.a conftest conftest.c + rm -rf conftest.dSYM fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_ld_force_load" >&5 +$as_echo "$lt_cv_ld_force_load" >&6; } + case $host_os in + rhapsody* | darwin1.[012]) + _lt_dar_allow_undefined='$wl-undefined ${wl}suppress' ;; + darwin1.*) + _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;; + darwin*) # darwin 5.x on + # if running on 10.5 or later, the deployment target defaults + # to the OS version, if on x86, and 10.4, the deployment + # target defaults to 10.4. Don't you love it? 
+ case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in + 10.0,*86*-darwin8*|10.0,*-darwin[91]*) + _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;; + 10.[012][,.]*) + _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;; + 10.*) + _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;; + esac + ;; + esac + if test yes = "$lt_cv_apple_cc_single_mod"; then + _lt_dar_single_mod='$single_module' + fi + if test yes = "$lt_cv_ld_exported_symbols_list"; then + _lt_dar_export_syms=' $wl-exported_symbols_list,$output_objdir/$libname-symbols.expsym' + else + _lt_dar_export_syms='~$NMEDIT -s $output_objdir/$libname-symbols.expsym $lib' + fi + if test : != "$DSYMUTIL" && test no = "$lt_cv_ld_force_load"; then + _lt_dsymutil='~$DSYMUTIL $lib || :' + else + _lt_dsymutil= + fi + ;; + esac -done - +# func_munge_path_list VARIABLE PATH +# ----------------------------------- +# VARIABLE is name of variable containing _space_ separated list of +# directories to be munged by the contents of PATH, which is string +# having a format: +# "DIR[:DIR]:" +# string "DIR[ DIR]" will be prepended to VARIABLE +# ":DIR[:DIR]" +# string "DIR[ DIR]" will be appended to VARIABLE +# "DIRP[:DIRP]::[DIRA:]DIRA" +# string "DIRP[ DIRP]" will be prepended to VARIABLE and string +# "DIRA[ DIRA]" will be appended to VARIABLE +# "DIR[:DIR]" +# VARIABLE will be replaced by "DIR[ DIR]" +func_munge_path_list () +{ + case x$2 in + x) + ;; + *:) + eval $1=\"`$ECHO $2 | $SED 's/:/ /g'` \$$1\" + ;; + x:*) + eval $1=\"\$$1 `$ECHO $2 | $SED 's/:/ /g'`\" + ;; + *::*) + eval $1=\"\$$1\ `$ECHO $2 | $SED -e 's/.*:://' -e 's/:/ /g'`\" + eval $1=\"`$ECHO $2 | $SED -e 's/::.*//' -e 's/:/ /g'`\ \$$1\" + ;; + *) + eval $1=\"`$ECHO $2 | $SED 's/:/ /g'`\" + ;; + esac +} for ac_header in dlfcn.h do : @@ -17644,10 +20167,11 @@ fi -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS 
$ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu CC=$lt_save_CC @@ -20756,10 +23280,11 @@ CFLAGS=$lt_save_CFLAGS fi # test yes != "$_lt_disable_FC" -ac_ext=${ac_fc_srcext-f} -ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' -ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_fc_compiler_gnu +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu @@ -20786,68 +23311,338 @@ -# Files: -DX_PROJECT=ELPA +# Files: +DX_PROJECT=ELPA + +DX_CONFIG=Doxyfile + +DX_DOCDIR=docs + + +# Environment variables used inside doxygen.cfg: +DX_ENV="$DX_ENV SRCDIR='$srcdir'" + +DX_ENV="$DX_ENV PROJECT='$DX_PROJECT'" + +DX_ENV="$DX_ENV DOCDIR='$DX_DOCDIR'" + +DX_ENV="$DX_ENV VERSION='$PACKAGE_VERSION'" + + +# Doxygen itself: + + + + # Check whether --enable-doxygen-doc was given. +if test "${enable_doxygen_doc+set}" = set; then : + enableval=$enable_doxygen_doc; +case "$enableval" in +#( +y|Y|yes|Yes|YES) + DX_FLAG_doc=1 + + +;; #( +n|N|no|No|NO) + DX_FLAG_doc=0 + +;; #( +*) + as_fn_error $? "invalid value '$enableval' given to doxygen-doc" "$LINENO" 5 +;; +esac + +else + +DX_FLAG_doc=1 + + + +fi + +if test "$DX_FLAG_doc" = 1; then + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}doxygen", so it can be a program name with args. +set dummy ${ac_tool_prefix}doxygen; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_path_DX_DOXYGEN+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $DX_DOXYGEN in + [\\/]* | ?:[\\/]*) + ac_cv_path_DX_DOXYGEN="$DX_DOXYGEN" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_DX_DOXYGEN="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +DX_DOXYGEN=$ac_cv_path_DX_DOXYGEN +if test -n "$DX_DOXYGEN"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DOXYGEN" >&5 +$as_echo "$DX_DOXYGEN" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_path_DX_DOXYGEN"; then + ac_pt_DX_DOXYGEN=$DX_DOXYGEN + # Extract the first word of "doxygen", so it can be a program name with args. +set dummy doxygen; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_DX_DOXYGEN+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $ac_pt_DX_DOXYGEN in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_DX_DOXYGEN="$ac_pt_DX_DOXYGEN" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_DX_DOXYGEN="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +ac_pt_DX_DOXYGEN=$ac_cv_path_ac_pt_DX_DOXYGEN +if test -n "$ac_pt_DX_DOXYGEN"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DOXYGEN" >&5 +$as_echo "$ac_pt_DX_DOXYGEN" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_pt_DX_DOXYGEN" = x; then + DX_DOXYGEN="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + DX_DOXYGEN=$ac_pt_DX_DOXYGEN + fi +else + DX_DOXYGEN="$ac_cv_path_DX_DOXYGEN" +fi + +if test "$DX_FLAG_doc$DX_DOXYGEN" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: doxygen not found - will not generate any doxygen documentation" >&5 +$as_echo "$as_me: WARNING: doxygen not found - will not generate any doxygen documentation" >&2;} + DX_FLAG_doc=0 + +fi + + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}perl", so it can be a program name with args. +set dummy ${ac_tool_prefix}perl; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_DX_PERL+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $DX_PERL in + [\\/]* | ?:[\\/]*) + ac_cv_path_DX_PERL="$DX_PERL" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_DX_PERL="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +DX_PERL=$ac_cv_path_DX_PERL +if test -n "$DX_PERL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_PERL" >&5 +$as_echo "$DX_PERL" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_path_DX_PERL"; then + ac_pt_DX_PERL=$DX_PERL + # Extract the first word of "perl", so it can be a program name with args. +set dummy perl; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_DX_PERL+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $ac_pt_DX_PERL in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_DX_PERL="$ac_pt_DX_PERL" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_DX_PERL="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +ac_pt_DX_PERL=$ac_cv_path_ac_pt_DX_PERL +if test -n "$ac_pt_DX_PERL"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_PERL" >&5 +$as_echo "$ac_pt_DX_PERL" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi -DX_CONFIG=Doxyfile + if test "x$ac_pt_DX_PERL" = x; then + DX_PERL="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + DX_PERL=$ac_pt_DX_PERL + fi +else + DX_PERL="$ac_cv_path_DX_PERL" +fi -DX_DOCDIR=docs +if test "$DX_FLAG_doc$DX_PERL" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: perl not found - will not generate any doxygen documentation" >&5 +$as_echo "$as_me: WARNING: perl not found - will not generate any doxygen documentation" >&2;} + DX_FLAG_doc=0 +fi -# Environment variables used inside doxygen.cfg: -DX_ENV="$DX_ENV SRCDIR='$srcdir'" + : +fi + if test "$DX_FLAG_doc" = 1; then + DX_COND_doc_TRUE= + DX_COND_doc_FALSE='#' +else + DX_COND_doc_TRUE='#' + DX_COND_doc_FALSE= +fi -DX_ENV="$DX_ENV PROJECT='$DX_PROJECT'" +if test "$DX_FLAG_doc" = 1; then + DX_ENV="$DX_ENV PERL_PATH='$DX_PERL'" -DX_ENV="$DX_ENV DOCDIR='$DX_DOCDIR'" + : +else -DX_ENV="$DX_ENV VERSION='$PACKAGE_VERSION'" + : +fi -# Doxygen itself: +# Dot for graphics: - # Check whether --enable-doxygen-doc was given. -if test "${enable_doxygen_doc+set}" = set; then : - enableval=$enable_doxygen_doc; + # Check whether --enable-doxygen-dot was given. 
+if test "${enable_doxygen_dot+set}" = set; then : + enableval=$enable_doxygen_dot; case "$enableval" in #( y|Y|yes|Yes|YES) - DX_FLAG_doc=1 + DX_FLAG_dot=1 +test "$DX_FLAG_doc" = "1" \ +|| as_fn_error $? "doxygen-dot requires doxygen-dot" "$LINENO" 5 + ;; #( n|N|no|No|NO) - DX_FLAG_doc=0 + DX_FLAG_dot=0 ;; #( *) - as_fn_error $? "invalid value '$enableval' given to doxygen-doc" "$LINENO" 5 + as_fn_error $? "invalid value '$enableval' given to doxygen-dot" "$LINENO" 5 ;; esac else -DX_FLAG_doc=1 +DX_FLAG_dot=0 + + +test "$DX_FLAG_doc" = "1" || DX_FLAG_dot=0 fi -if test "$DX_FLAG_doc" = 1; then +if test "$DX_FLAG_dot" = 1; then if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}doxygen", so it can be a program name with args. -set dummy ${ac_tool_prefix}doxygen; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}dot", so it can be a program name with args. +set dummy ${ac_tool_prefix}dot; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_DOXYGEN+:} false; then : +if ${ac_cv_path_DX_DOT+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_DOXYGEN in + case $DX_DOT in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_DOXYGEN="$DX_DOXYGEN" # Let the user override the test with a path. + ac_cv_path_DX_DOT="$DX_DOT" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -20857,7 +23652,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_DOXYGEN="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_DOT="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -20868,10 +23663,10 @@ ;; esac fi -DX_DOXYGEN=$ac_cv_path_DX_DOXYGEN -if test -n "$DX_DOXYGEN"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DOXYGEN" >&5 -$as_echo "$DX_DOXYGEN" >&6; } +DX_DOT=$ac_cv_path_DX_DOT +if test -n "$DX_DOT"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DOT" >&5 +$as_echo "$DX_DOT" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -20879,18 +23674,18 @@ fi -if test -z "$ac_cv_path_DX_DOXYGEN"; then - ac_pt_DX_DOXYGEN=$DX_DOXYGEN - # Extract the first word of "doxygen", so it can be a program name with args. -set dummy doxygen; ac_word=$2 +if test -z "$ac_cv_path_DX_DOT"; then + ac_pt_DX_DOT=$DX_DOT + # Extract the first word of "dot", so it can be a program name with args. +set dummy dot; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_DOXYGEN+:} false; then : +if ${ac_cv_path_ac_pt_DX_DOT+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_DOXYGEN in + case $ac_pt_DX_DOT in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_DOXYGEN="$ac_pt_DX_DOXYGEN" # Let the user override the test with a path. + ac_cv_path_ac_pt_DX_DOT="$ac_pt_DX_DOT" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -20900,7 +23695,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_DOXYGEN="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_DOT="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -20911,17 +23706,17 @@ ;; esac fi -ac_pt_DX_DOXYGEN=$ac_cv_path_ac_pt_DX_DOXYGEN -if test -n "$ac_pt_DX_DOXYGEN"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DOXYGEN" >&5 -$as_echo "$ac_pt_DX_DOXYGEN" >&6; } +ac_pt_DX_DOT=$ac_cv_path_ac_pt_DX_DOT +if test -n "$ac_pt_DX_DOT"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DOT" >&5 +$as_echo "$ac_pt_DX_DOT" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_DOXYGEN" = x; then - DX_DOXYGEN="" + if test "x$ac_pt_DX_DOT" = x; then + DX_DOT="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -20929,195 +23724,272 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_DOXYGEN=$ac_pt_DX_DOXYGEN + DX_DOT=$ac_pt_DX_DOT fi else - DX_DOXYGEN="$ac_cv_path_DX_DOXYGEN" + DX_DOT="$ac_cv_path_DX_DOT" +fi + +if test "$DX_FLAG_dot$DX_DOT" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: dot not found - will not generate graphics for doxygen documentation" >&5 +$as_echo "$as_me: WARNING: dot not found - will not generate graphics for doxygen documentation" >&2;} + DX_FLAG_dot=0 + +fi + + : +fi + if test "$DX_FLAG_dot" = 1; then + DX_COND_dot_TRUE= + DX_COND_dot_FALSE='#' +else + DX_COND_dot_TRUE='#' + DX_COND_dot_FALSE= +fi + +if test "$DX_FLAG_dot" = 1; then + DX_ENV="$DX_ENV HAVE_DOT='YES'" + + DX_ENV="$DX_ENV DOT_PATH='`expr ".$DX_DOT" : '\(\.\)[^/]*$' \| "x$DX_DOT" : 'x\(.*\)/[^/]*$'`'" + + : +else + DX_ENV="$DX_ENV HAVE_DOT='NO'" + + : +fi + + +# Man pages generation: + + + + # Check whether --enable-doxygen-man was given. 
+if test "${enable_doxygen_man+set}" = set; then : + enableval=$enable_doxygen_man; +case "$enableval" in +#( +y|Y|yes|Yes|YES) + DX_FLAG_man=1 + + +test "$DX_FLAG_doc" = "1" \ +|| as_fn_error $? "doxygen-man requires doxygen-man" "$LINENO" 5 + +;; #( +n|N|no|No|NO) + DX_FLAG_man=0 + +;; #( +*) + as_fn_error $? "invalid value '$enableval' given to doxygen-man" "$LINENO" 5 +;; +esac + +else + +DX_FLAG_man=1 + + +test "$DX_FLAG_doc" = "1" || DX_FLAG_man=0 + + + +fi + +if test "$DX_FLAG_man" = 1; then + + : +fi + if test "$DX_FLAG_man" = 1; then + DX_COND_man_TRUE= + DX_COND_man_FALSE='#' +else + DX_COND_man_TRUE='#' + DX_COND_man_FALSE= +fi + +if test "$DX_FLAG_man" = 1; then + DX_ENV="$DX_ENV GENERATE_MAN='YES'" + + : +else + DX_ENV="$DX_ENV GENERATE_MAN='NO'" + + : fi -if test "$DX_FLAG_doc$DX_DOXYGEN" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: doxygen not found - will not generate any doxygen documentation" >&5 -$as_echo "$as_me: WARNING: doxygen not found - will not generate any doxygen documentation" >&2;} - DX_FLAG_doc=0 -fi +# RTF file generation: + + + + # Check whether --enable-doxygen-rtf was given. +if test "${enable_doxygen_rtf+set}" = set; then : + enableval=$enable_doxygen_rtf; +case "$enableval" in +#( +y|Y|yes|Yes|YES) + DX_FLAG_rtf=1 + + +test "$DX_FLAG_doc" = "1" \ +|| as_fn_error $? "doxygen-rtf requires doxygen-rtf" "$LINENO" 5 + +;; #( +n|N|no|No|NO) + DX_FLAG_rtf=0 + +;; #( +*) + as_fn_error $? "invalid value '$enableval' given to doxygen-rtf" "$LINENO" 5 +;; +esac + +else + +DX_FLAG_rtf=0 + + +test "$DX_FLAG_doc" = "1" || DX_FLAG_rtf=0 -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}perl", so it can be a program name with args. -set dummy ${ac_tool_prefix}perl; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_path_DX_PERL+:} false; then : - $as_echo_n "(cached) " >&6 -else - case $DX_PERL in - [\\/]* | ?:[\\/]*) - ac_cv_path_DX_PERL="$DX_PERL" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_PERL="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - ;; -esac -fi -DX_PERL=$ac_cv_path_DX_PERL -if test -n "$DX_PERL"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_PERL" >&5 -$as_echo "$DX_PERL" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } fi +if test "$DX_FLAG_rtf" = 1; then + : fi -if test -z "$ac_cv_path_DX_PERL"; then - ac_pt_DX_PERL=$DX_PERL - # Extract the first word of "perl", so it can be a program name with args. -set dummy perl; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_PERL+:} false; then : - $as_echo_n "(cached) " >&6 + if test "$DX_FLAG_rtf" = 1; then + DX_COND_rtf_TRUE= + DX_COND_rtf_FALSE='#' else - case $ac_pt_DX_PERL in - [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_PERL="$ac_pt_DX_PERL" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_PERL="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac + DX_COND_rtf_TRUE='#' + DX_COND_rtf_FALSE= fi -ac_pt_DX_PERL=$ac_cv_path_ac_pt_DX_PERL -if test -n "$ac_pt_DX_PERL"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_PERL" >&5 -$as_echo "$ac_pt_DX_PERL" >&6; } + +if test "$DX_FLAG_rtf" = 1; then + DX_ENV="$DX_ENV GENERATE_RTF='YES'" + + : else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + DX_ENV="$DX_ENV GENERATE_RTF='NO'" + + : fi - if test "x$ac_pt_DX_PERL" = x; then - DX_PERL="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; + +# XML file generation: + + + + # Check whether --enable-doxygen-xml was given. +if test "${enable_doxygen_xml+set}" = set; then : + enableval=$enable_doxygen_xml; +case "$enableval" in +#( +y|Y|yes|Yes|YES) + DX_FLAG_xml=1 + + +test "$DX_FLAG_doc" = "1" \ +|| as_fn_error $? "doxygen-xml requires doxygen-xml" "$LINENO" 5 + +;; #( +n|N|no|No|NO) + DX_FLAG_xml=0 + +;; #( +*) + as_fn_error $? 
"invalid value '$enableval' given to doxygen-xml" "$LINENO" 5 +;; esac - DX_PERL=$ac_pt_DX_PERL - fi + else - DX_PERL="$ac_cv_path_DX_PERL" -fi -if test "$DX_FLAG_doc$DX_PERL" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: perl not found - will not generate any doxygen documentation" >&5 -$as_echo "$as_me: WARNING: perl not found - will not generate any doxygen documentation" >&2;} - DX_FLAG_doc=0 +DX_FLAG_xml=0 + + +test "$DX_FLAG_doc" = "1" || DX_FLAG_xml=0 + + fi +if test "$DX_FLAG_xml" = 1; then + : fi - if test "$DX_FLAG_doc" = 1; then - DX_COND_doc_TRUE= - DX_COND_doc_FALSE='#' + if test "$DX_FLAG_xml" = 1; then + DX_COND_xml_TRUE= + DX_COND_xml_FALSE='#' else - DX_COND_doc_TRUE='#' - DX_COND_doc_FALSE= + DX_COND_xml_TRUE='#' + DX_COND_xml_FALSE= fi -if test "$DX_FLAG_doc" = 1; then - DX_ENV="$DX_ENV PERL_PATH='$DX_PERL'" +if test "$DX_FLAG_xml" = 1; then + DX_ENV="$DX_ENV GENERATE_XML='YES'" : else + DX_ENV="$DX_ENV GENERATE_XML='NO'" : fi -# Dot for graphics: +# (Compressed) HTML help generation: - # Check whether --enable-doxygen-dot was given. -if test "${enable_doxygen_dot+set}" = set; then : - enableval=$enable_doxygen_dot; + # Check whether --enable-doxygen-chm was given. +if test "${enable_doxygen_chm+set}" = set; then : + enableval=$enable_doxygen_chm; case "$enableval" in #( y|Y|yes|Yes|YES) - DX_FLAG_dot=1 + DX_FLAG_chm=1 test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-dot requires doxygen-dot" "$LINENO" 5 +|| as_fn_error $? "doxygen-chm requires doxygen-chm" "$LINENO" 5 ;; #( n|N|no|No|NO) - DX_FLAG_dot=0 + DX_FLAG_chm=0 ;; #( *) - as_fn_error $? "invalid value '$enableval' given to doxygen-dot" "$LINENO" 5 + as_fn_error $? 
"invalid value '$enableval' given to doxygen-chm" "$LINENO" 5 ;; esac else -DX_FLAG_dot=0 +DX_FLAG_chm=0 -test "$DX_FLAG_doc" = "1" || DX_FLAG_dot=0 +test "$DX_FLAG_doc" = "1" || DX_FLAG_chm=0 fi -if test "$DX_FLAG_dot" = 1; then +if test "$DX_FLAG_chm" = 1; then if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}dot", so it can be a program name with args. -set dummy ${ac_tool_prefix}dot; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}hhc", so it can be a program name with args. +set dummy ${ac_tool_prefix}hhc; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_DOT+:} false; then : +if ${ac_cv_path_DX_HHC+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_DOT in + case $DX_HHC in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_DOT="$DX_DOT" # Let the user override the test with a path. + ac_cv_path_DX_HHC="$DX_HHC" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21127,7 +23999,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_DOT="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_HHC="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21138,10 +24010,10 @@ ;; esac fi -DX_DOT=$ac_cv_path_DX_DOT -if test -n "$DX_DOT"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DOT" >&5 -$as_echo "$DX_DOT" >&6; } +DX_HHC=$ac_cv_path_DX_HHC +if test -n "$DX_HHC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_HHC" >&5 +$as_echo "$DX_HHC" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -21149,18 +24021,18 @@ fi -if test -z "$ac_cv_path_DX_DOT"; then - ac_pt_DX_DOT=$DX_DOT - # Extract the first word of "dot", so it can be a program name with args. 
-set dummy dot; ac_word=$2 +if test -z "$ac_cv_path_DX_HHC"; then + ac_pt_DX_HHC=$DX_HHC + # Extract the first word of "hhc", so it can be a program name with args. +set dummy hhc; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_DOT+:} false; then : +if ${ac_cv_path_ac_pt_DX_HHC+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_DOT in + case $ac_pt_DX_HHC in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_DOT="$ac_pt_DX_DOT" # Let the user override the test with a path. + ac_cv_path_ac_pt_DX_HHC="$ac_pt_DX_HHC" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21170,7 +24042,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_DOT="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_HHC="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21181,17 +24053,17 @@ ;; esac fi -ac_pt_DX_DOT=$ac_cv_path_ac_pt_DX_DOT -if test -n "$ac_pt_DX_DOT"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DOT" >&5 -$as_echo "$ac_pt_DX_DOT" >&6; } +ac_pt_DX_HHC=$ac_cv_path_ac_pt_DX_HHC +if test -n "$ac_pt_DX_HHC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_HHC" >&5 +$as_echo "$ac_pt_DX_HHC" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_DOT" = x; then - DX_DOT="" + if test "x$ac_pt_DX_HHC" = x; then + DX_HHC="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -21199,272 +24071,326 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_DOT=$ac_pt_DX_DOT + DX_HHC=$ac_pt_DX_HHC fi else - DX_DOT="$ac_cv_path_DX_DOT" + DX_HHC="$ac_cv_path_DX_HHC" fi -if test "$DX_FLAG_dot$DX_DOT" 
= 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: dot not found - will not generate graphics for doxygen documentation" >&5 -$as_echo "$as_me: WARNING: dot not found - will not generate graphics for doxygen documentation" >&2;} - DX_FLAG_dot=0 +if test "$DX_FLAG_chm$DX_HHC" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: hhc not found - will not generate doxygen compressed HTML help documentation" >&5 +$as_echo "$as_me: WARNING: hhc not found - will not generate doxygen compressed HTML help documentation" >&2;} + DX_FLAG_chm=0 fi : fi - if test "$DX_FLAG_dot" = 1; then - DX_COND_dot_TRUE= - DX_COND_dot_FALSE='#' + if test "$DX_FLAG_chm" = 1; then + DX_COND_chm_TRUE= + DX_COND_chm_FALSE='#' else - DX_COND_dot_TRUE='#' - DX_COND_dot_FALSE= + DX_COND_chm_TRUE='#' + DX_COND_chm_FALSE= fi -if test "$DX_FLAG_dot" = 1; then - DX_ENV="$DX_ENV HAVE_DOT='YES'" +if test "$DX_FLAG_chm" = 1; then + DX_ENV="$DX_ENV HHC_PATH='$DX_HHC'" + + DX_ENV="$DX_ENV GENERATE_HTML='YES'" - DX_ENV="$DX_ENV DOT_PATH='`expr ".$DX_DOT" : '\(\.\)[^/]*$' \| "x$DX_DOT" : 'x\(.*\)/[^/]*$'`'" + DX_ENV="$DX_ENV GENERATE_HTMLHELP='YES'" : else - DX_ENV="$DX_ENV HAVE_DOT='NO'" + DX_ENV="$DX_ENV GENERATE_HTMLHELP='NO'" : fi -# Man pages generation: +# Seperate CHI file generation. - # Check whether --enable-doxygen-man was given. -if test "${enable_doxygen_man+set}" = set; then : - enableval=$enable_doxygen_man; + # Check whether --enable-doxygen-chi was given. +if test "${enable_doxygen_chi+set}" = set; then : + enableval=$enable_doxygen_chi; case "$enableval" in #( y|Y|yes|Yes|YES) - DX_FLAG_man=1 + DX_FLAG_chi=1 -test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-man requires doxygen-man" "$LINENO" 5 +test "$DX_FLAG_chm" = "1" \ +|| as_fn_error $? "doxygen-chi requires doxygen-chi" "$LINENO" 5 ;; #( n|N|no|No|NO) - DX_FLAG_man=0 + DX_FLAG_chi=0 ;; #( *) - as_fn_error $? "invalid value '$enableval' given to doxygen-man" "$LINENO" 5 + as_fn_error $? 
"invalid value '$enableval' given to doxygen-chi" "$LINENO" 5 ;; esac else -DX_FLAG_man=1 +DX_FLAG_chi=0 -test "$DX_FLAG_doc" = "1" || DX_FLAG_man=0 +test "$DX_FLAG_chm" = "1" || DX_FLAG_chi=0 fi -if test "$DX_FLAG_man" = 1; then +if test "$DX_FLAG_chi" = 1; then : fi - if test "$DX_FLAG_man" = 1; then - DX_COND_man_TRUE= - DX_COND_man_FALSE='#' + if test "$DX_FLAG_chi" = 1; then + DX_COND_chi_TRUE= + DX_COND_chi_FALSE='#' else - DX_COND_man_TRUE='#' - DX_COND_man_FALSE= + DX_COND_chi_TRUE='#' + DX_COND_chi_FALSE= fi -if test "$DX_FLAG_man" = 1; then - DX_ENV="$DX_ENV GENERATE_MAN='YES'" +if test "$DX_FLAG_chi" = 1; then + DX_ENV="$DX_ENV GENERATE_CHI='YES'" : else - DX_ENV="$DX_ENV GENERATE_MAN='NO'" + DX_ENV="$DX_ENV GENERATE_CHI='NO'" : fi -# RTF file generation: +# Plain HTML pages generation: - # Check whether --enable-doxygen-rtf was given. -if test "${enable_doxygen_rtf+set}" = set; then : - enableval=$enable_doxygen_rtf; + # Check whether --enable-doxygen-html was given. +if test "${enable_doxygen_html+set}" = set; then : + enableval=$enable_doxygen_html; case "$enableval" in #( y|Y|yes|Yes|YES) - DX_FLAG_rtf=1 + DX_FLAG_html=1 test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-rtf requires doxygen-rtf" "$LINENO" 5 +|| as_fn_error $? "doxygen-html requires doxygen-html" "$LINENO" 5 + +test "$DX_FLAG_chm" = "0" \ +|| as_fn_error $? "doxygen-html contradicts doxygen-html" "$LINENO" 5 ;; #( n|N|no|No|NO) - DX_FLAG_rtf=0 + DX_FLAG_html=0 ;; #( *) - as_fn_error $? "invalid value '$enableval' given to doxygen-rtf" "$LINENO" 5 + as_fn_error $? 
"invalid value '$enableval' given to doxygen-html" "$LINENO" 5 ;; esac else -DX_FLAG_rtf=0 +DX_FLAG_html=1 -test "$DX_FLAG_doc" = "1" || DX_FLAG_rtf=0 +test "$DX_FLAG_doc" = "1" || DX_FLAG_html=0 + + +test "$DX_FLAG_chm" = "0" || DX_FLAG_html=0 fi -if test "$DX_FLAG_rtf" = 1; then +if test "$DX_FLAG_html" = 1; then : fi - if test "$DX_FLAG_rtf" = 1; then - DX_COND_rtf_TRUE= - DX_COND_rtf_FALSE='#' + if test "$DX_FLAG_html" = 1; then + DX_COND_html_TRUE= + DX_COND_html_FALSE='#' else - DX_COND_rtf_TRUE='#' - DX_COND_rtf_FALSE= + DX_COND_html_TRUE='#' + DX_COND_html_FALSE= fi -if test "$DX_FLAG_rtf" = 1; then - DX_ENV="$DX_ENV GENERATE_RTF='YES'" +if test "$DX_FLAG_html" = 1; then + DX_ENV="$DX_ENV GENERATE_HTML='YES'" : else - DX_ENV="$DX_ENV GENERATE_RTF='NO'" + test "$DX_FLAG_chm" = 1 || DX_ENV="$DX_ENV GENERATE_HTML='NO'" : fi -# XML file generation: +# PostScript file generation: - # Check whether --enable-doxygen-xml was given. -if test "${enable_doxygen_xml+set}" = set; then : - enableval=$enable_doxygen_xml; + # Check whether --enable-doxygen-ps was given. +if test "${enable_doxygen_ps+set}" = set; then : + enableval=$enable_doxygen_ps; case "$enableval" in #( y|Y|yes|Yes|YES) - DX_FLAG_xml=1 + DX_FLAG_ps=1 test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-xml requires doxygen-xml" "$LINENO" 5 +|| as_fn_error $? "doxygen-ps requires doxygen-ps" "$LINENO" 5 ;; #( n|N|no|No|NO) - DX_FLAG_xml=0 + DX_FLAG_ps=0 ;; #( *) - as_fn_error $? "invalid value '$enableval' given to doxygen-xml" "$LINENO" 5 + as_fn_error $? "invalid value '$enableval' given to doxygen-ps" "$LINENO" 5 ;; esac else -DX_FLAG_xml=0 +DX_FLAG_ps=0 -test "$DX_FLAG_doc" = "1" || DX_FLAG_xml=0 +test "$DX_FLAG_doc" = "1" || DX_FLAG_ps=0 fi -if test "$DX_FLAG_xml" = 1; then +if test "$DX_FLAG_ps" = 1; then - : +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}latex", so it can be a program name with args. 
+set dummy ${ac_tool_prefix}latex; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_DX_LATEX+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $DX_LATEX in + [\\/]* | ?:[\\/]*) + ac_cv_path_DX_LATEX="$DX_LATEX" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_DX_LATEX="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac fi - if test "$DX_FLAG_xml" = 1; then - DX_COND_xml_TRUE= - DX_COND_xml_FALSE='#' +DX_LATEX=$ac_cv_path_DX_LATEX +if test -n "$DX_LATEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_LATEX" >&5 +$as_echo "$DX_LATEX" >&6; } else - DX_COND_xml_TRUE='#' - DX_COND_xml_FALSE= + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -if test "$DX_FLAG_xml" = 1; then - DX_ENV="$DX_ENV GENERATE_XML='YES'" - : +fi +if test -z "$ac_cv_path_DX_LATEX"; then + ac_pt_DX_LATEX=$DX_LATEX + # Extract the first word of "latex", so it can be a program name with args. +set dummy latex; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_DX_LATEX+:} false; then : + $as_echo_n "(cached) " >&6 else - DX_ENV="$DX_ENV GENERATE_XML='NO'" + case $ac_pt_DX_LATEX in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_DX_LATEX="$ac_pt_DX_LATEX" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_DX_LATEX="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS - : + ;; +esac +fi +ac_pt_DX_LATEX=$ac_cv_path_ac_pt_DX_LATEX +if test -n "$ac_pt_DX_LATEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_LATEX" >&5 +$as_echo "$ac_pt_DX_LATEX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi - -# (Compressed) HTML help generation: - - - - # Check whether --enable-doxygen-chm was given. -if test "${enable_doxygen_chm+set}" = set; then : - enableval=$enable_doxygen_chm; -case "$enableval" in -#( -y|Y|yes|Yes|YES) - DX_FLAG_chm=1 - - -test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-chm requires doxygen-chm" "$LINENO" 5 - -;; #( -n|N|no|No|NO) - DX_FLAG_chm=0 - -;; #( -*) - as_fn_error $? "invalid value '$enableval' given to doxygen-chm" "$LINENO" 5 -;; + if test "x$ac_pt_DX_LATEX" = x; then + DX_LATEX="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; esac - + DX_LATEX=$ac_pt_DX_LATEX + fi else + DX_LATEX="$ac_cv_path_DX_LATEX" +fi -DX_FLAG_chm=0 - - -test "$DX_FLAG_doc" = "1" || DX_FLAG_chm=0 - - +if test "$DX_FLAG_ps$DX_LATEX" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: latex not found - will not generate doxygen PostScript documentation" >&5 +$as_echo "$as_me: WARNING: latex not found - will not generate doxygen PostScript documentation" >&2;} + DX_FLAG_ps=0 fi -if test "$DX_FLAG_chm" = 1; then if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}hhc", so it can be a program name with args. 
-set dummy ${ac_tool_prefix}hhc; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}makeindex", so it can be a program name with args. +set dummy ${ac_tool_prefix}makeindex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_HHC+:} false; then : +if ${ac_cv_path_DX_MAKEINDEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_HHC in + case $DX_MAKEINDEX in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_HHC="$DX_HHC" # Let the user override the test with a path. + ac_cv_path_DX_MAKEINDEX="$DX_MAKEINDEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21474,7 +24400,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_HHC="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21485,10 +24411,10 @@ ;; esac fi -DX_HHC=$ac_cv_path_DX_HHC -if test -n "$DX_HHC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_HHC" >&5 -$as_echo "$DX_HHC" >&6; } +DX_MAKEINDEX=$ac_cv_path_DX_MAKEINDEX +if test -n "$DX_MAKEINDEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_MAKEINDEX" >&5 +$as_echo "$DX_MAKEINDEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -21496,18 +24422,18 @@ fi -if test -z "$ac_cv_path_DX_HHC"; then - ac_pt_DX_HHC=$DX_HHC - # Extract the first word of "hhc", so it can be a program name with args. -set dummy hhc; ac_word=$2 +if test -z "$ac_cv_path_DX_MAKEINDEX"; then + ac_pt_DX_MAKEINDEX=$DX_MAKEINDEX + # Extract the first word of "makeindex", so it can be a program name with args. +set dummy makeindex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_path_ac_pt_DX_HHC+:} false; then : +if ${ac_cv_path_ac_pt_DX_MAKEINDEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_HHC in + case $ac_pt_DX_MAKEINDEX in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_HHC="$ac_pt_DX_HHC" # Let the user override the test with a path. + ac_cv_path_ac_pt_DX_MAKEINDEX="$ac_pt_DX_MAKEINDEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21517,7 +24443,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_HHC="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21528,17 +24454,17 @@ ;; esac fi -ac_pt_DX_HHC=$ac_cv_path_ac_pt_DX_HHC -if test -n "$ac_pt_DX_HHC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_HHC" >&5 -$as_echo "$ac_pt_DX_HHC" >&6; } +ac_pt_DX_MAKEINDEX=$ac_cv_path_ac_pt_DX_MAKEINDEX +if test -n "$ac_pt_DX_MAKEINDEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_MAKEINDEX" >&5 +$as_echo "$ac_pt_DX_MAKEINDEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_HHC" = x; then - DX_HHC="" + if test "x$ac_pt_DX_MAKEINDEX" = x; then + DX_MAKEINDEX="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -21546,220 +24472,137 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_HHC=$ac_pt_DX_HHC + DX_MAKEINDEX=$ac_pt_DX_MAKEINDEX fi else - DX_HHC="$ac_cv_path_DX_HHC" + DX_MAKEINDEX="$ac_cv_path_DX_MAKEINDEX" fi -if test "$DX_FLAG_chm$DX_HHC" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: hhc not found - will not generate doxygen compressed HTML help documentation" >&5 -$as_echo "$as_me: WARNING: hhc not found - will not 
generate doxygen compressed HTML help documentation" >&2;} - DX_FLAG_chm=0 - -fi +if test "$DX_FLAG_ps$DX_MAKEINDEX" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: makeindex not found - will not generate doxygen PostScript documentation" >&5 +$as_echo "$as_me: WARNING: makeindex not found - will not generate doxygen PostScript documentation" >&2;} + DX_FLAG_ps=0 - : -fi - if test "$DX_FLAG_chm" = 1; then - DX_COND_chm_TRUE= - DX_COND_chm_FALSE='#' -else - DX_COND_chm_TRUE='#' - DX_COND_chm_FALSE= fi -if test "$DX_FLAG_chm" = 1; then - DX_ENV="$DX_ENV HHC_PATH='$DX_HHC'" - - DX_ENV="$DX_ENV GENERATE_HTML='YES'" - - DX_ENV="$DX_ENV GENERATE_HTMLHELP='YES'" - : +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}dvips", so it can be a program name with args. +set dummy ${ac_tool_prefix}dvips; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_DX_DVIPS+:} false; then : + $as_echo_n "(cached) " >&6 else - DX_ENV="$DX_ENV GENERATE_HTMLHELP='NO'" - - : -fi - - -# Seperate CHI file generation. - - - - # Check whether --enable-doxygen-chi was given. -if test "${enable_doxygen_chi+set}" = set; then : - enableval=$enable_doxygen_chi; -case "$enableval" in -#( -y|Y|yes|Yes|YES) - DX_FLAG_chi=1 - - -test "$DX_FLAG_chm" = "1" \ -|| as_fn_error $? "doxygen-chi requires doxygen-chi" "$LINENO" 5 - -;; #( -n|N|no|No|NO) - DX_FLAG_chi=0 + case $DX_DVIPS in + [\\/]* | ?:[\\/]*) + ac_cv_path_DX_DVIPS="$DX_DVIPS" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_DX_DVIPS="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -;; #( -*) - as_fn_error $? "invalid value '$enableval' given to doxygen-chi" "$LINENO" 5 -;; + ;; esac - -else - -DX_FLAG_chi=0 - - -test "$DX_FLAG_chm" = "1" || DX_FLAG_chi=0 - - - -fi - -if test "$DX_FLAG_chi" = 1; then - - : -fi - if test "$DX_FLAG_chi" = 1; then - DX_COND_chi_TRUE= - DX_COND_chi_FALSE='#' -else - DX_COND_chi_TRUE='#' - DX_COND_chi_FALSE= fi - -if test "$DX_FLAG_chi" = 1; then - DX_ENV="$DX_ENV GENERATE_CHI='YES'" - - : -else - DX_ENV="$DX_ENV GENERATE_CHI='NO'" - - : -fi - - -# Plain HTML pages generation: - - - - # Check whether --enable-doxygen-html was given. -if test "${enable_doxygen_html+set}" = set; then : - enableval=$enable_doxygen_html; -case "$enableval" in -#( -y|Y|yes|Yes|YES) - DX_FLAG_html=1 - - -test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-html requires doxygen-html" "$LINENO" 5 - -test "$DX_FLAG_chm" = "0" \ -|| as_fn_error $? "doxygen-html contradicts doxygen-html" "$LINENO" 5 - -;; #( -n|N|no|No|NO) - DX_FLAG_html=0 - -;; #( -*) - as_fn_error $? 
"invalid value '$enableval' given to doxygen-html" "$LINENO" 5 -;; -esac - +DX_DVIPS=$ac_cv_path_DX_DVIPS +if test -n "$DX_DVIPS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DVIPS" >&5 +$as_echo "$DX_DVIPS" >&6; } else - -DX_FLAG_html=1 - - -test "$DX_FLAG_doc" = "1" || DX_FLAG_html=0 - - -test "$DX_FLAG_chm" = "0" || DX_FLAG_html=0 - - - + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -if test "$DX_FLAG_html" = 1; then - : -fi - if test "$DX_FLAG_html" = 1; then - DX_COND_html_TRUE= - DX_COND_html_FALSE='#' -else - DX_COND_html_TRUE='#' - DX_COND_html_FALSE= fi - -if test "$DX_FLAG_html" = 1; then - DX_ENV="$DX_ENV GENERATE_HTML='YES'" - - : +if test -z "$ac_cv_path_DX_DVIPS"; then + ac_pt_DX_DVIPS=$DX_DVIPS + # Extract the first word of "dvips", so it can be a program name with args. +set dummy dvips; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_DX_DVIPS+:} false; then : + $as_echo_n "(cached) " >&6 else - test "$DX_FLAG_chm" = 1 || DX_ENV="$DX_ENV GENERATE_HTML='NO'" - - : -fi - - -# PostScript file generation: - - - - # Check whether --enable-doxygen-ps was given. -if test "${enable_doxygen_ps+set}" = set; then : - enableval=$enable_doxygen_ps; -case "$enableval" in -#( -y|Y|yes|Yes|YES) - DX_FLAG_ps=1 - - -test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-ps requires doxygen-ps" "$LINENO" 5 - -;; #( -n|N|no|No|NO) - DX_FLAG_ps=0 + case $ac_pt_DX_DVIPS in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_DX_DVIPS="$ac_pt_DX_DVIPS" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_DX_DVIPS="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS -;; #( -*) - as_fn_error $? "invalid value '$enableval' given to doxygen-ps" "$LINENO" 5 -;; + ;; esac - +fi +ac_pt_DX_DVIPS=$ac_cv_path_ac_pt_DX_DVIPS +if test -n "$ac_pt_DX_DVIPS"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DVIPS" >&5 +$as_echo "$ac_pt_DX_DVIPS" >&6; } else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi -DX_FLAG_ps=0 - - -test "$DX_FLAG_doc" = "1" || DX_FLAG_ps=0 - + if test "x$ac_pt_DX_DVIPS" = x; then + DX_DVIPS="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + DX_DVIPS=$ac_pt_DX_DVIPS + fi +else + DX_DVIPS="$ac_cv_path_DX_DVIPS" +fi +if test "$DX_FLAG_ps$DX_DVIPS" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: dvips not found - will not generate doxygen PostScript documentation" >&5 +$as_echo "$as_me: WARNING: dvips not found - will not generate doxygen PostScript documentation" >&2;} + DX_FLAG_ps=0 fi -if test "$DX_FLAG_ps" = 1; then if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}latex", so it can be a program name with args. -set dummy ${ac_tool_prefix}latex; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}egrep", so it can be a program name with args. +set dummy ${ac_tool_prefix}egrep; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_path_DX_LATEX+:} false; then : +if ${ac_cv_path_DX_EGREP+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_LATEX in + case $DX_EGREP in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_LATEX="$DX_LATEX" # Let the user override the test with a path. + ac_cv_path_DX_EGREP="$DX_EGREP" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21769,7 +24612,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_LATEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_EGREP="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21780,10 +24623,10 @@ ;; esac fi -DX_LATEX=$ac_cv_path_DX_LATEX -if test -n "$DX_LATEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_LATEX" >&5 -$as_echo "$DX_LATEX" >&6; } +DX_EGREP=$ac_cv_path_DX_EGREP +if test -n "$DX_EGREP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_EGREP" >&5 +$as_echo "$DX_EGREP" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -21791,18 +24634,18 @@ fi -if test -z "$ac_cv_path_DX_LATEX"; then - ac_pt_DX_LATEX=$DX_LATEX - # Extract the first word of "latex", so it can be a program name with args. -set dummy latex; ac_word=$2 +if test -z "$ac_cv_path_DX_EGREP"; then + ac_pt_DX_EGREP=$DX_EGREP + # Extract the first word of "egrep", so it can be a program name with args. +set dummy egrep; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_LATEX+:} false; then : +if ${ac_cv_path_ac_pt_DX_EGREP+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_LATEX in + case $ac_pt_DX_EGREP in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_LATEX="$ac_pt_DX_LATEX" # Let the user override the test with a path. 
+ ac_cv_path_ac_pt_DX_EGREP="$ac_pt_DX_EGREP" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21812,7 +24655,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_LATEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_EGREP="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21823,17 +24666,17 @@ ;; esac fi -ac_pt_DX_LATEX=$ac_cv_path_ac_pt_DX_LATEX -if test -n "$ac_pt_DX_LATEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_LATEX" >&5 -$as_echo "$ac_pt_DX_LATEX" >&6; } +ac_pt_DX_EGREP=$ac_cv_path_ac_pt_DX_EGREP +if test -n "$ac_pt_DX_EGREP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_EGREP" >&5 +$as_echo "$ac_pt_DX_EGREP" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_LATEX" = x; then - DX_LATEX="" + if test "x$ac_pt_DX_EGREP" = x; then + DX_EGREP="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -21841,31 +24684,88 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_LATEX=$ac_pt_DX_LATEX + DX_EGREP=$ac_pt_DX_EGREP fi else - DX_LATEX="$ac_cv_path_DX_LATEX" + DX_EGREP="$ac_cv_path_DX_EGREP" fi -if test "$DX_FLAG_ps$DX_LATEX" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: latex not found - will not generate doxygen PostScript documentation" >&5 -$as_echo "$as_me: WARNING: latex not found - will not generate doxygen PostScript documentation" >&2;} +if test "$DX_FLAG_ps$DX_EGREP" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: egrep not found - will not generate doxygen PostScript documentation" >&5 +$as_echo "$as_me: WARNING: egrep not found - will not generate doxygen PostScript documentation" >&2;} DX_FLAG_ps=0 fi + : +fi 
+ if test "$DX_FLAG_ps" = 1; then + DX_COND_ps_TRUE= + DX_COND_ps_FALSE='#' +else + DX_COND_ps_TRUE='#' + DX_COND_ps_FALSE= +fi + +if test "$DX_FLAG_ps" = 1; then + + : +else + + : +fi + + +# PDF file generation: + + + + # Check whether --enable-doxygen-pdf was given. +if test "${enable_doxygen_pdf+set}" = set; then : + enableval=$enable_doxygen_pdf; +case "$enableval" in +#( +y|Y|yes|Yes|YES) + DX_FLAG_pdf=1 + + +test "$DX_FLAG_doc" = "1" \ +|| as_fn_error $? "doxygen-pdf requires doxygen-pdf" "$LINENO" 5 + +;; #( +n|N|no|No|NO) + DX_FLAG_pdf=0 + +;; #( +*) + as_fn_error $? "invalid value '$enableval' given to doxygen-pdf" "$LINENO" 5 +;; +esac + +else + +DX_FLAG_pdf=0 + + +test "$DX_FLAG_doc" = "1" || DX_FLAG_pdf=0 + + + +fi + +if test "$DX_FLAG_pdf" = 1; then if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}makeindex", so it can be a program name with args. -set dummy ${ac_tool_prefix}makeindex; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}pdflatex", so it can be a program name with args. +set dummy ${ac_tool_prefix}pdflatex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_MAKEINDEX+:} false; then : +if ${ac_cv_path_DX_PDFLATEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_MAKEINDEX in + case $DX_PDFLATEX in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_MAKEINDEX="$DX_MAKEINDEX" # Let the user override the test with a path. + ac_cv_path_DX_PDFLATEX="$DX_PDFLATEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21875,7 +24775,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_PDFLATEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21886,10 +24786,10 @@ ;; esac fi -DX_MAKEINDEX=$ac_cv_path_DX_MAKEINDEX -if test -n "$DX_MAKEINDEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_MAKEINDEX" >&5 -$as_echo "$DX_MAKEINDEX" >&6; } +DX_PDFLATEX=$ac_cv_path_DX_PDFLATEX +if test -n "$DX_PDFLATEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_PDFLATEX" >&5 +$as_echo "$DX_PDFLATEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -21897,18 +24797,18 @@ fi -if test -z "$ac_cv_path_DX_MAKEINDEX"; then - ac_pt_DX_MAKEINDEX=$DX_MAKEINDEX - # Extract the first word of "makeindex", so it can be a program name with args. -set dummy makeindex; ac_word=$2 +if test -z "$ac_cv_path_DX_PDFLATEX"; then + ac_pt_DX_PDFLATEX=$DX_PDFLATEX + # Extract the first word of "pdflatex", so it can be a program name with args. +set dummy pdflatex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_MAKEINDEX+:} false; then : +if ${ac_cv_path_ac_pt_DX_PDFLATEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_MAKEINDEX in + case $ac_pt_DX_PDFLATEX in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_MAKEINDEX="$ac_pt_DX_MAKEINDEX" # Let the user override the test with a path. + ac_cv_path_ac_pt_DX_PDFLATEX="$ac_pt_DX_PDFLATEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21918,7 +24818,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_PDFLATEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21929,17 +24829,17 @@ ;; esac fi -ac_pt_DX_MAKEINDEX=$ac_cv_path_ac_pt_DX_MAKEINDEX -if test -n "$ac_pt_DX_MAKEINDEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_MAKEINDEX" >&5 -$as_echo "$ac_pt_DX_MAKEINDEX" >&6; } +ac_pt_DX_PDFLATEX=$ac_cv_path_ac_pt_DX_PDFLATEX +if test -n "$ac_pt_DX_PDFLATEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_PDFLATEX" >&5 +$as_echo "$ac_pt_DX_PDFLATEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_MAKEINDEX" = x; then - DX_MAKEINDEX="" + if test "x$ac_pt_DX_PDFLATEX" = x; then + DX_PDFLATEX="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -21947,31 +24847,31 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_MAKEINDEX=$ac_pt_DX_MAKEINDEX + DX_PDFLATEX=$ac_pt_DX_PDFLATEX fi else - DX_MAKEINDEX="$ac_cv_path_DX_MAKEINDEX" + DX_PDFLATEX="$ac_cv_path_DX_PDFLATEX" fi -if test "$DX_FLAG_ps$DX_MAKEINDEX" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: makeindex not found - will not generate doxygen PostScript documentation" >&5 -$as_echo "$as_me: WARNING: makeindex not found - will not generate doxygen PostScript documentation" >&2;} - DX_FLAG_ps=0 +if test "$DX_FLAG_pdf$DX_PDFLATEX" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: pdflatex not found - will not generate doxygen PDF documentation" >&5 +$as_echo "$as_me: WARNING: pdflatex not found - will not generate doxygen PDF documentation" >&2;} + DX_FLAG_pdf=0 fi if test -n "$ac_tool_prefix"; then - # Extract the first word of 
"${ac_tool_prefix}dvips", so it can be a program name with args. -set dummy ${ac_tool_prefix}dvips; ac_word=$2 + # Extract the first word of "${ac_tool_prefix}makeindex", so it can be a program name with args. +set dummy ${ac_tool_prefix}makeindex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_DVIPS+:} false; then : +if ${ac_cv_path_DX_MAKEINDEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_DVIPS in + case $DX_MAKEINDEX in [\\/]* | ?:[\\/]*) - ac_cv_path_DX_DVIPS="$DX_DVIPS" # Let the user override the test with a path. + ac_cv_path_DX_MAKEINDEX="$DX_MAKEINDEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -21981,7 +24881,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_DVIPS="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -21992,10 +24892,10 @@ ;; esac fi -DX_DVIPS=$ac_cv_path_DX_DVIPS -if test -n "$DX_DVIPS"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_DVIPS" >&5 -$as_echo "$DX_DVIPS" >&6; } +DX_MAKEINDEX=$ac_cv_path_DX_MAKEINDEX +if test -n "$DX_MAKEINDEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_MAKEINDEX" >&5 +$as_echo "$DX_MAKEINDEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } @@ -22003,18 +24903,18 @@ fi -if test -z "$ac_cv_path_DX_DVIPS"; then - ac_pt_DX_DVIPS=$DX_DVIPS - # Extract the first word of "dvips", so it can be a program name with args. -set dummy dvips; ac_word=$2 +if test -z "$ac_cv_path_DX_MAKEINDEX"; then + ac_pt_DX_MAKEINDEX=$DX_MAKEINDEX + # Extract the first word of "makeindex", so it can be a program name with args. 
+set dummy makeindex; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_DVIPS+:} false; then : +if ${ac_cv_path_ac_pt_DX_MAKEINDEX+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_DVIPS in + case $ac_pt_DX_MAKEINDEX in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_DVIPS="$ac_pt_DX_DVIPS" # Let the user override the test with a path. + ac_cv_path_ac_pt_DX_MAKEINDEX="$ac_pt_DX_MAKEINDEX" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -22024,7 +24924,7 @@ test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_DVIPS="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_ac_pt_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -22035,17 +24935,17 @@ ;; esac fi -ac_pt_DX_DVIPS=$ac_cv_path_ac_pt_DX_DVIPS -if test -n "$ac_pt_DX_DVIPS"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_DVIPS" >&5 -$as_echo "$ac_pt_DX_DVIPS" >&6; } +ac_pt_DX_MAKEINDEX=$ac_cv_path_ac_pt_DX_MAKEINDEX +if test -n "$ac_pt_DX_MAKEINDEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_MAKEINDEX" >&5 +$as_echo "$ac_pt_DX_MAKEINDEX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_DVIPS" = x; then - DX_DVIPS="" + if test "x$ac_pt_DX_MAKEINDEX" = x; then + DX_MAKEINDEX="" else case $cross_compiling:$ac_tool_warned in yes:) @@ -22053,16 +24953,16 @@ $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac - DX_DVIPS=$ac_pt_DX_DVIPS + DX_MAKEINDEX=$ac_pt_DX_MAKEINDEX fi else - DX_DVIPS="$ac_cv_path_DX_DVIPS" + DX_MAKEINDEX="$ac_cv_path_DX_MAKEINDEX" fi -if test "$DX_FLAG_ps$DX_DVIPS" = 1; then - { $as_echo 
"$as_me:${as_lineno-$LINENO}: WARNING: dvips not found - will not generate doxygen PostScript documentation" >&5 -$as_echo "$as_me: WARNING: dvips not found - will not generate doxygen PostScript documentation" >&2;} - DX_FLAG_ps=0 +if test "$DX_FLAG_pdf$DX_MAKEINDEX" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: makeindex not found - will not generate doxygen PDF documentation" >&5 +$as_echo "$as_me: WARNING: makeindex not found - will not generate doxygen PDF documentation" >&2;} + DX_FLAG_pdf=0 fi @@ -22162,128 +25062,662 @@ DX_EGREP=$ac_pt_DX_EGREP fi else - DX_EGREP="$ac_cv_path_DX_EGREP" + DX_EGREP="$ac_cv_path_DX_EGREP" +fi + +if test "$DX_FLAG_pdf$DX_EGREP" = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: egrep not found - will not generate doxygen PDF documentation" >&5 +$as_echo "$as_me: WARNING: egrep not found - will not generate doxygen PDF documentation" >&2;} + DX_FLAG_pdf=0 + +fi + + : +fi + if test "$DX_FLAG_pdf" = 1; then + DX_COND_pdf_TRUE= + DX_COND_pdf_FALSE='#' +else + DX_COND_pdf_TRUE='#' + DX_COND_pdf_FALSE= +fi + +if test "$DX_FLAG_pdf" = 1; then + + : +else + + : +fi + + +# LaTeX generation for PS and/or PDF: + if test "$DX_FLAG_ps" = 1 || test "$DX_FLAG_pdf" = 1; then + DX_COND_latex_TRUE= + DX_COND_latex_FALSE='#' +else + DX_COND_latex_TRUE='#' + DX_COND_latex_FALSE= +fi + +if test "$DX_FLAG_ps" = 1 || test "$DX_FLAG_pdf" = 1; then + DX_ENV="$DX_ENV GENERATE_LATEX='YES'" + +else + DX_ENV="$DX_ENV GENERATE_LATEX='NO'" + +fi + +# Paper size for PS and/or PDF: + +case "$DOXYGEN_PAPER_SIZE" in +#( +"") + DOXYGEN_PAPER_SIZE="" + +;; #( +a4wide|a4|letter|legal|executive) + DX_ENV="$DX_ENV PAPER_SIZE='$DOXYGEN_PAPER_SIZE'" + +;; #( +*) + as_fn_error $? 
"unknown DOXYGEN_PAPER_SIZE='$DOXYGEN_PAPER_SIZE'" "$LINENO" 5 +;; +esac + +#For debugging: +#echo DX_FLAG_doc=$DX_FLAG_doc +#echo DX_FLAG_dot=$DX_FLAG_dot +#echo DX_FLAG_man=$DX_FLAG_man +#echo DX_FLAG_html=$DX_FLAG_html +#echo DX_FLAG_chm=$DX_FLAG_chm +#echo DX_FLAG_chi=$DX_FLAG_chi +#echo DX_FLAG_rtf=$DX_FLAG_rtf +#echo DX_FLAG_xml=$DX_FLAG_xml +#echo DX_FLAG_pdf=$DX_FLAG_pdf +#echo DX_FLAG_ps=$DX_FLAG_ps +#echo DX_ENV=$DX_ENV + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether assumed size Fortran arrays should be used" >&5 +$as_echo_n "checking whether assumed size Fortran arrays should be used... " >&6; } +# Check whether --enable-assumed-size was given. +if test "${enable_assumed_size+set}" = set; then : + enableval=$enable_assumed_size; if test x"$enableval" = x"yes"; then + USE_ASSUMED_SIZE=yes + else + USE_ASSUMED_SIZE=no + fi +else + USE_ASSUMED_SIZE=yes +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${USE_ASSUMED_SIZE}" >&5 +$as_echo "${USE_ASSUMED_SIZE}" >&6; } + if test x"$USE_ASSUMED_SIZE" = x"yes"; then + WITH_USE_ASSUMED_SIZE_TRUE= + WITH_USE_ASSUMED_SIZE_FALSE='#' +else + WITH_USE_ASSUMED_SIZE_TRUE='#' + WITH_USE_ASSUMED_SIZE_FALSE= +fi + +if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + +$as_echo "#define USE_ASSUMED_SIZE 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Fortran2008 features should be enabled" >&5 +$as_echo_n "checking whether Fortran2008 features should be enabled... " >&6; } +# Check whether --enable-Fortran2008-features was given. 
+if test "${enable_Fortran2008_features+set}" = set; then : + enableval=$enable_Fortran2008_features; + if test x"$enableval" = x"yes"; then + enable_fortran2008_features=yes + else + enable_fortran2008_features=no + fi + +else + enable_fortran2008_features=yes +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_fortran2008_features}" >&5 +$as_echo "${enable_fortran2008_features}" >&6; } + if test x"$enable_fortran2008_features" = x"yes"; then + USE_FORTRAN2008_TRUE= + USE_FORTRAN2008_FALSE='#' +else + USE_FORTRAN2008_TRUE='#' + USE_FORTRAN2008_FALSE= +fi + +if test x"${enable_fortran2008_features}" = x"yes"; then + +$as_echo "#define USE_FORTRAN2008 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether autotuning functionality should be enabled" >&5 +$as_echo_n "checking whether autotuning functionality should be enabled... " >&6; } +# Check whether --enable-autotuning was given. +if test "${enable_autotuning+set}" = set; then : + enableval=$enable_autotuning; + if test x"$enableval" = x"yes"; then + enable_autotuning=yes + else + enable_autotuning=no + fi + +else + enable_autotuning=yes +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_autotuning}" >&5 +$as_echo "${enable_autotuning}" >&6; } + if test x"$enable_autotuning" = x"yes"; then + ENABLE_AUTOTUNING_TRUE= + ENABLE_AUTOTUNING_FALSE='#' +else + ENABLE_AUTOTUNING_TRUE='#' + ENABLE_AUTOTUNING_FALSE= +fi + +if test x"${enable_autotuning}" = x"yes"; then + +$as_echo "#define ENABLE_AUTOTUNING 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C tests should be provided" >&5 +$as_echo_n "checking whether C tests should be provided... " >&6; } +# Check whether --enable-c-tests was given. 
+if test "${enable_c_tests+set}" = set; then : + enableval=$enable_c_tests; + if test x"$enableval" = x"yes"; then + enable_c_tests=yes + else + enable_c_tests=no + fi + +else + enable_c_tests=yes +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_c_tests}" >&5 +$as_echo "${enable_c_tests}" >&6; } + if test x"$enable_c_tests" = x"yes"; then + ENABLE_C_TESTS_TRUE= + ENABLE_C_TESTS_FALSE='#' +else + ENABLE_C_TESTS_TRUE='#' + ENABLE_C_TESTS_FALSE= +fi + +if test x"${enable_c_tests}" = x"yes"; then + +$as_echo "#define ENABLE_C_TESTS 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we build for K-computer" >&5 +$as_echo_n "checking whether we build for K-computer... " >&6; } +# Check whether --enable-K-computer was given. +if test "${enable_K_computer+set}" = set; then : + enableval=$enable_K_computer; if test x"$enableval" = x"yes"; then + enable_kcomputer=yes + else + enable_kcomputer=no + fi +else + enable_kcomputer=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_kcomputer}" >&5 +$as_echo "${enable_kcomputer}" >&6; } + if test x"$enable_kcomputer" = x"yes"; then + BUILD_KCOMPUTER_TRUE= + BUILD_KCOMPUTER_FALSE='#' +else + BUILD_KCOMPUTER_TRUE='#' + BUILD_KCOMPUTER_FALSE= +fi + +if test x"${enable_kcomputer}" = x"yes"; then + +$as_echo "#define BUILD_KCOMPUTER 1" >>confdefs.h + + FC_MODINC="-I" + if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + as_fn_error $? "on K-computer you have to switch off assumed-size arrays!" "$LINENO" 5 + fi + if test x"${enable_fortran2008_features}" = x"yes" ; then + as_fn_error $? "on K-computer you have to switch off Fortran 2008 features!" "$LINENO" 5 + fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we build for NEC SX-Auroa" >&5 +$as_echo_n "checking whether we build for NEC SX-Auroa... " >&6; } +# Check whether --enable-SX-Aurora was given. 
+if test "${enable_SX_Aurora+set}" = set; then : + enableval=$enable_SX_Aurora; if test x"$enableval" = x"yes"; then + enable_sxaurora=yes + else + enable_sxaurora=no + fi +else + enable_kcomputer=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_sxaurora}" >&5 +$as_echo "${enable_sxaurora}" >&6; } + if test x"$enable_sxaurora" = x"yes"; then + BUILD_KCOMPUTER_TRUE= + BUILD_KCOMPUTER_FALSE='#' +else + BUILD_KCOMPUTER_TRUE='#' + BUILD_KCOMPUTER_FALSE= +fi + +if test x"${enable_sxaurora}" = x"yes"; then + +$as_echo "#define BUILD_SXAURORA 1" >>confdefs.h + + FC_MODINC="-I" + #if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + # AC_MSG_ERROR(on K-computer you have to switch off assumed-size arrays!) + #fi + if test x"${enable_fortran2008_features}" = x"yes" ; then + as_fn_error $? "on SX-Aurora you have to switch off Fortran 2008 features!" "$LINENO" 5 + fi +fi + +if test x"${want_single_precision}" = x"yes" ; then + +$as_echo "#define WANT_SINGLE_PRECISION_REAL 1" >>confdefs.h + + +$as_echo "#define WANT_SINGLE_PRECISION_COMPLEX 1" >>confdefs.h + +fi + if test x"$want_single_precision" = x"yes"; then + WANT_SINGLE_PRECISION_REAL_TRUE= + WANT_SINGLE_PRECISION_REAL_FALSE='#' +else + WANT_SINGLE_PRECISION_REAL_TRUE='#' + WANT_SINGLE_PRECISION_REAL_FALSE= +fi + + if test x"$want_single_precision" = x"yes"; then + WANT_SINGLE_PRECISION_COMPLEX_TRUE= + WANT_SINGLE_PRECISION_COMPLEX_FALSE='#' +else + WANT_SINGLE_PRECISION_COMPLEX_TRUE='#' + WANT_SINGLE_PRECISION_COMPLEX_FALSE= +fi + + +#always define SKEWSYMMETRIC for the moment + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we should enable skew-symmetric support" >&5 +$as_echo_n "checking whether we should enable skew-symmetric support... " >&6; } +# Check whether --enable-skew-symmetric-support was given. 
+if test "${enable_skew_symmetric_support+set}" = set; then : + enableval=$enable_skew_symmetric_support; if test x"$enableval" = x"yes"; then + enable_skewsymmetric=yes + else + enable_skewsymmetric=no + fi +else + enable_skewsymmetric=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_skewsymmetric}" >&5 +$as_echo "${enable_skewsymmetric}" >&6; } + if test x"$enable_skewsymmetric" = x"yes"; then + HAVE_SKEWSYMMETRIC_TRUE= + HAVE_SKEWSYMMETRIC_FALSE='#' +else + HAVE_SKEWSYMMETRIC_TRUE='#' + HAVE_SKEWSYMMETRIC_FALSE= +fi + +if test x"${enable_skewsymmetric}" = x"yes"; then + +$as_echo "#define HAVE_SKEWSYMMETRIC 1" >>confdefs.h + +fi + + + + + + + + + +DOXYGEN_OUTPUT_DIR=docs + + +mkdir -p modules private_modules test_modules + + +#gl_VISIBILITY +#AH_BOTTOM([#if HAVE_VISIBILITY +#define EXPORTED __attribute__((__visibility__("default"))) +#define HIDDEN __attribute__((__visibility__("hidden"))) +#else +#define EXPORTED +#define HIDDEN +#endif]) + + +# Some part of libtool is too smart and tries to parse the output of +# gfortran -v +# and catches anything that resembles a -l library link specification. +# Unfortunately, recent versions of gfortran emit +# -l gfortran +# with a space between -l and gfortran. The empty -l is then included +# into "postdeps_FC" and causes linking errors later on. +postdeps_FC=$(echo $postdeps_FC | sed 's/-l //g') + +if test x"${with_mpi}" = x"yes"; then + if test x"${enable_openmp}" = x"yes"; then + SUFFIX="_openmp" + else + SUFFIX="" + fi +else + if test x"${enable_openmp}" = x"yes"; then + SUFFIX="_onenode_openmp" + else + SUFFIX="_onenode" + fi +fi + +echo "checking whether build config should be compiled into the library..." +# Extract the first word of "xxd", so it can be a program name with args. +set dummy xxd; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_xxd_CHECK+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$xxd_CHECK"; then + ac_cv_prog_xxd_CHECK="$xxd_CHECK" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_xxd_CHECK="yes" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +xxd_CHECK=$ac_cv_prog_xxd_CHECK +if test -n "$xxd_CHECK"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $xxd_CHECK" >&5 +$as_echo "$xxd_CHECK" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -if test "$DX_FLAG_ps$DX_EGREP" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: egrep not found - will not generate doxygen PostScript documentation" >&5 -$as_echo "$as_me: WARNING: egrep not found - will not generate doxygen PostScript documentation" >&2;} - DX_FLAG_ps=0 +if test x"$xxd_CHECK" != x"yes"; then : + as_fn_error $? "Please install xxd before configuring." "$LINENO" 5 fi +# Check whether --enable-store-build-config was given. 
+if test "${enable_store_build_config+set}" = set; then : + enableval=$enable_store_build_config; + if test x"$enableval" = x"yes"; then + store_build_config=yes + else + store_build_config=no + fi - : +else + store_build_config=no fi - if test "$DX_FLAG_ps" = 1; then - DX_COND_ps_TRUE= - DX_COND_ps_FALSE='#' + + if test x"$store_build_config" = x"yes"; then + STORE_BUILD_CONFIG_TRUE= + STORE_BUILD_CONFIG_FALSE='#' else - DX_COND_ps_TRUE='#' - DX_COND_ps_FALSE= + STORE_BUILD_CONFIG_TRUE='#' + STORE_BUILD_CONFIG_FALSE= fi -if test "$DX_FLAG_ps" = 1; then +if test x"${store_build_config}" = x"yes"; then + echo "build config should be compiled into the library: yes" + +$as_echo "#define STORE_BUILD_CONFIG 1" >>confdefs.h - : else + echo "build config should be compiled into the library: no" +fi - : + + +PKG_CONFIG_FILE=elpa${SUFFIX}-${PACKAGE_VERSION}.pc + + +ac_config_files="$ac_config_files Makefile Doxyfile ${PKG_CONFIG_FILE}:elpa.pc.in elpa/elpa_constants.h elpa/elpa_version.h elpa/elpa_build_config.h" + + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if workaround for broken preprocessor is needed" >&5 +$as_echo_n "checking if workaround for broken preprocessor is needed... 
" >&6; } + +need_manual_cpp=no + + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + cat > conftest.$ac_ext <<_ACEOF + +program test_define +#ifndef __INTEL_COMPILER + choke me +#endif +end program + +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + is_intel=yes fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -# PDF file generation: + ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + + cat > conftest.$ac_ext <<_ACEOF +program test_define +#ifndef __PGI + choke me +#endif +end program - # Check whether --enable-doxygen-pdf was given. -if test "${enable_doxygen_pdf+set}" = set; then : - enableval=$enable_doxygen_pdf; -case "$enableval" in -#( -y|Y|yes|Yes|YES) - DX_FLAG_pdf=1 +_ACEOF +if ac_fn_fc_try_compile "$LINENO"; then : + is_pgi=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu -test "$DX_FLAG_doc" = "1" \ -|| as_fn_error $? "doxygen-pdf requires doxygen-pdf" "$LINENO" 5 +ACTUAL_FC="$FC" -;; #( -n|N|no|No|NO) - DX_FLAG_pdf=0 -;; #( -*) - as_fn_error $? 
"invalid value '$enableval' given to doxygen-pdf" "$LINENO" 5 -;; -esac +if test x"$is_intel" = x"yes" ; then + need_manual_cpp=yes +fi +if test x"$is_pgi" = x"yes" ; then + need_manual_cpp=yes +fi +if test x"$need_manual_cpp" = x"yes" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + FC="\$(top_srcdir)/manual_cpp $FC" else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi -DX_FLAG_pdf=0 - +if test x"$is_pgi" = x"yes" ; then -test "$DX_FLAG_doc" = "1" || DX_FLAG_pdf=0 +$as_echo "#define PGI_VARIABLE_STRING_BUG 1" >>confdefs.h +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we have to escape '-module' for libtool" >&5 +$as_echo_n "checking whether we have to escape '-module' for libtool... " >&6; } +if test x"$FC_MODOUT" = x'-module ' ; then + FC_MODOUT="-Xcompiler $FC_MODOUT -Xcompiler \$(ac_empty)" + FC="\$(top_srcdir)/remove_xcompiler $FC" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } fi -if test "$DX_FLAG_pdf" = 1; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether --enable-python is specified" >&5 +$as_echo_n "checking whether --enable-python is specified... " >&6; } +# Check whether --enable-python was given. +if test "${enable_python+set}" = set; then : + enableval=$enable_python; + if test x"$enableval" = x"yes"; then + enable_python=yes + else + enable_python=no + fi -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}pdflatex", so it can be a program name with args. -set dummy ${ac_tool_prefix}pdflatex; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_path_DX_PDFLATEX+:} false; then : - $as_echo_n "(cached) " >&6 else - case $DX_PDFLATEX in - [\\/]* | ?:[\\/]*) - ac_cv_path_DX_PDFLATEX="$DX_PDFLATEX" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_PDFLATEX="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + enable_python=no +fi - ;; -esac +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_python}" >&5 +$as_echo "${enable_python}" >&6; } + if test x"$enable_python" = x"yes"; then + WITH_PYTHON_TRUE= + WITH_PYTHON_FALSE='#' +else + WITH_PYTHON_TRUE='#' + WITH_PYTHON_FALSE= fi -DX_PDFLATEX=$ac_cv_path_DX_PDFLATEX -if test -n "$DX_PDFLATEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_PDFLATEX" >&5 -$as_echo "$DX_PDFLATEX" >&6; } + +if test x"${enable_python}" = x"yes"; then + +$as_echo "#define WITH_PYTHON 1" >>confdefs.h + + # check for python and dependencies + + + + + + + if test -n "$PYTHON"; then + # If the user set $PYTHON, use it and don't search something else. + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $PYTHON version is >= 3.6" >&5 +$as_echo_n "checking whether $PYTHON version is >= 3.6... " >&6; } + prog="import sys +# split strings by '.' and convert to numeric. Append some zeros +# because we need at least 4 digits for the hex conversion. 
+# map returns an iterator in Python 3.0 and a list in 2.x +minver = list(map(int, '3.6'.split('.'))) + [0, 0, 0] +minverhex = 0 +# xrange is not present in Python 3.0 and range returns an iterator +for i in list(range(0, 4)): minverhex = (minverhex << 8) + minver[i] +sys.exit(sys.hexversion < minverhex)" + if { echo "$as_me:$LINENO: $PYTHON -c "$prog"" >&5 + ($PYTHON -c "$prog") >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } + as_fn_error $? "Python interpreter is too old" "$LINENO" 5 fi + am_display_PYTHON=$PYTHON + else + # Otherwise, try each interpreter until we find one that satisfies + # VERSION. + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for a Python interpreter with version >= 3.6" >&5 +$as_echo_n "checking for a Python interpreter with version >= 3.6... " >&6; } +if ${am_cv_pathless_PYTHON+:} false; then : + $as_echo_n "(cached) " >&6 +else - + for am_cv_pathless_PYTHON in python python2 python3 python3.9 python3.8 python3.7 python3.6 python3.5 python3.4 python3.3 python3.2 python3.1 python3.0 python2.7 python2.6 python2.5 python2.4 python2.3 python2.2 python2.1 python2.0 none; do + test "$am_cv_pathless_PYTHON" = none && break + prog="import sys +# split strings by '.' and convert to numeric. Append some zeros +# because we need at least 4 digits for the hex conversion. +# map returns an iterator in Python 3.0 and a list in 2.x +minver = list(map(int, '3.6'.split('.'))) + [0, 0, 0] +minverhex = 0 +# xrange is not present in Python 3.0 and range returns an iterator +for i in list(range(0, 4)): minverhex = (minverhex << 8) + minver[i] +sys.exit(sys.hexversion < minverhex)" + if { echo "$as_me:$LINENO: $am_cv_pathless_PYTHON -c "$prog"" >&5 + ($am_cv_pathless_PYTHON -c "$prog") >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); }; then : + break fi -if test -z "$ac_cv_path_DX_PDFLATEX"; then - ac_pt_DX_PDFLATEX=$DX_PDFLATEX - # Extract the first word of "pdflatex", so it can be a program name with args. -set dummy pdflatex; ac_word=$2 + done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_pathless_PYTHON" >&5 +$as_echo "$am_cv_pathless_PYTHON" >&6; } + # Set $PYTHON to the absolute path of $am_cv_pathless_PYTHON. + if test "$am_cv_pathless_PYTHON" = none; then + PYTHON=: + else + # Extract the first word of "$am_cv_pathless_PYTHON", so it can be a program name with args. +set dummy $am_cv_pathless_PYTHON; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_PDFLATEX+:} false; then : +if ${ac_cv_path_PYTHON+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_PDFLATEX in + case $PYTHON in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_PDFLATEX="$ac_pt_DX_PDFLATEX" # Let the user override the test with a path. + ac_cv_path_PYTHON="$PYTHON" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR @@ -22293,7 +25727,7 @@ test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_PDFLATEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_PYTHON="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -22304,102 +25738,198 @@ ;; esac fi -ac_pt_DX_PDFLATEX=$ac_cv_path_ac_pt_DX_PDFLATEX -if test -n "$ac_pt_DX_PDFLATEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_PDFLATEX" >&5 -$as_echo "$ac_pt_DX_PDFLATEX" >&6; } +PYTHON=$ac_cv_path_PYTHON +if test -n "$PYTHON"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PYTHON" >&5 +$as_echo "$PYTHON" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_PDFLATEX" = x; then - DX_PDFLATEX="" + + fi + am_display_PYTHON=$am_cv_pathless_PYTHON + fi + + + if test "$PYTHON" = :; then + as_fn_error $? "no suitable Python interpreter found" "$LINENO" 5 else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - DX_PDFLATEX=$ac_pt_DX_PDFLATEX - fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $am_display_PYTHON version" >&5 +$as_echo_n "checking for $am_display_PYTHON version... 
" >&6; } +if ${am_cv_python_version+:} false; then : + $as_echo_n "(cached) " >&6 else - DX_PDFLATEX="$ac_cv_path_DX_PDFLATEX" + am_cv_python_version=`$PYTHON -c "import sys; sys.stdout.write(sys.version[:3])"` fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_python_version" >&5 +$as_echo "$am_cv_python_version" >&6; } + PYTHON_VERSION=$am_cv_python_version -if test "$DX_FLAG_pdf$DX_PDFLATEX" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: pdflatex not found - will not generate doxygen PDF documentation" >&5 -$as_echo "$as_me: WARNING: pdflatex not found - will not generate doxygen PDF documentation" >&2;} - DX_FLAG_pdf=0 + + PYTHON_PREFIX='${prefix}' + + PYTHON_EXEC_PREFIX='${exec_prefix}' + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $am_display_PYTHON platform" >&5 +$as_echo_n "checking for $am_display_PYTHON platform... " >&6; } +if ${am_cv_python_platform+:} false; then : + $as_echo_n "(cached) " >&6 +else + am_cv_python_platform=`$PYTHON -c "import sys; sys.stdout.write(sys.platform)"` fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_python_platform" >&5 +$as_echo "$am_cv_python_platform" >&6; } + PYTHON_PLATFORM=$am_cv_python_platform -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}makeindex", so it can be a program name with args. -set dummy ${ac_tool_prefix}makeindex; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_MAKEINDEX+:} false; then : + # Just factor out some code duplication. + am_python_setup_sysconfig="\ +import sys +# Prefer sysconfig over distutils.sysconfig, for better compatibility +# with python 3.x. See automake bug#10227. 
+try: + import sysconfig +except ImportError: + can_use_sysconfig = 0 +else: + can_use_sysconfig = 1 +# Can't use sysconfig in CPython 2.7, since it's broken in virtualenvs: +# +try: + from platform import python_implementation + if python_implementation() == 'CPython' and sys.version[:3] == '2.7': + can_use_sysconfig = 0 +except ImportError: + pass" + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $am_display_PYTHON script directory" >&5 +$as_echo_n "checking for $am_display_PYTHON script directory... " >&6; } +if ${am_cv_python_pythondir+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_MAKEINDEX in - [\\/]* | ?:[\\/]*) - ac_cv_path_DX_MAKEINDEX="$DX_MAKEINDEX" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS + if test "x$prefix" = xNONE + then + am_py_prefix=$ac_default_prefix + else + am_py_prefix=$prefix + fi + am_cv_python_pythondir=`$PYTHON -c " +$am_python_setup_sysconfig +if can_use_sysconfig: + sitedir = sysconfig.get_path('purelib', vars={'base':'$am_py_prefix'}) +else: + from distutils import sysconfig + sitedir = sysconfig.get_python_lib(0, 0, prefix='$am_py_prefix') +sys.stdout.write(sitedir)"` + case $am_cv_python_pythondir in + $am_py_prefix*) + am__strip_prefix=`echo "$am_py_prefix" | sed 's|.|.|g'` + am_cv_python_pythondir=`echo "$am_cv_python_pythondir" | sed "s,^$am__strip_prefix,$PYTHON_PREFIX,"` + ;; + *) + case $am_py_prefix in + /usr|/System*) ;; + *) + am_cv_python_pythondir=$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages + ;; + esac + ;; + esac - ;; -esac fi -DX_MAKEINDEX=$ac_cv_path_DX_MAKEINDEX 
-if test -n "$DX_MAKEINDEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_MAKEINDEX" >&5 -$as_echo "$DX_MAKEINDEX" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_python_pythondir" >&5 +$as_echo "$am_cv_python_pythondir" >&6; } + pythondir=$am_cv_python_pythondir + + + + pkgpythondir=\${pythondir}/$PACKAGE + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $am_display_PYTHON extension module directory" >&5 +$as_echo_n "checking for $am_display_PYTHON extension module directory... " >&6; } +if ${am_cv_python_pyexecdir+:} false; then : + $as_echo_n "(cached) " >&6 else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } + if test "x$exec_prefix" = xNONE + then + am_py_exec_prefix=$am_py_prefix + else + am_py_exec_prefix=$exec_prefix + fi + am_cv_python_pyexecdir=`$PYTHON -c " +$am_python_setup_sysconfig +if can_use_sysconfig: + sitedir = sysconfig.get_path('platlib', vars={'platbase':'$am_py_prefix'}) +else: + from distutils import sysconfig + sitedir = sysconfig.get_python_lib(1, 0, prefix='$am_py_prefix') +sys.stdout.write(sitedir)"` + case $am_cv_python_pyexecdir in + $am_py_exec_prefix*) + am__strip_prefix=`echo "$am_py_exec_prefix" | sed 's|.|.|g'` + am_cv_python_pyexecdir=`echo "$am_cv_python_pyexecdir" | sed "s,^$am__strip_prefix,$PYTHON_EXEC_PREFIX,"` + ;; + *) + case $am_py_exec_prefix in + /usr|/System*) ;; + *) + am_cv_python_pyexecdir=$PYTHON_EXEC_PREFIX/lib/python$PYTHON_VERSION/site-packages + ;; + esac + ;; + esac + fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_python_pyexecdir" >&5 +$as_echo "$am_cv_python_pyexecdir" >&6; } + pyexecdir=$am_cv_python_pyexecdir -fi -if test -z "$ac_cv_path_DX_MAKEINDEX"; then - ac_pt_DX_MAKEINDEX=$DX_MAKEINDEX - # Extract the first word of "makeindex", so it can be a program name with args. 
-set dummy makeindex; ac_word=$2 + + pkgpyexecdir=\${pyexecdir}/$PACKAGE + + + + fi + + + + + if test -z "$PYTHON_INCLUDE"; then : + + if test -z "$PYTHON_CONFIG"; then : + + for ac_prog in python$PYTHON_VERSION-config python-config +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_MAKEINDEX+:} false; then : +if ${ac_cv_path_PYTHON_CONFIG+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_MAKEINDEX in + case $PYTHON_CONFIG in [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_MAKEINDEX="$ac_pt_DX_MAKEINDEX" # Let the user override the test with a path. + ac_cv_path_PYTHON_CONFIG="$PYTHON_CONFIG" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH +for as_dir in `dirname $PYTHON` do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_MAKEINDEX="$as_dir/$ac_word$ac_exec_ext" + ac_cv_path_PYTHON_CONFIG="$as_dir/$ac_word$ac_exec_ext" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -22410,59 +25940,74 @@ ;; esac fi -ac_pt_DX_MAKEINDEX=$ac_cv_path_ac_pt_DX_MAKEINDEX -if test -n "$ac_pt_DX_MAKEINDEX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_MAKEINDEX" >&5 -$as_echo "$ac_pt_DX_MAKEINDEX" >&6; } +PYTHON_CONFIG=$ac_cv_path_PYTHON_CONFIG +if test -n "$PYTHON_CONFIG"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PYTHON_CONFIG" >&5 +$as_echo "$PYTHON_CONFIG" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_MAKEINDEX" = x; then - DX_MAKEINDEX="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - DX_MAKEINDEX=$ac_pt_DX_MAKEINDEX - fi -else - DX_MAKEINDEX="$ac_cv_path_DX_MAKEINDEX" -fi -if test "$DX_FLAG_pdf$DX_MAKEINDEX" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: makeindex not found - will not generate doxygen PDF documentation" >&5 -$as_echo "$as_me: WARNING: makeindex not found - will not generate doxygen PDF documentation" >&2;} - DX_FLAG_pdf=0 + test -n "$PYTHON_CONFIG" && break +done +test -n "$PYTHON_CONFIG" || PYTHON_CONFIG="no" + if test "$PYTHON_CONFIG" = no; then : + as_fn_error $? "cannot find python-config for $PYTHON." "$LINENO" 5 fi +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking python include flags" >&5 +$as_echo_n "checking python include flags... 
" >&6; } + PYTHON_INCLUDE=`$PYTHON_CONFIG --includes` + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PYTHON_INCLUDE" >&5 +$as_echo "$PYTHON_INCLUDE" >&6; } -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}egrep", so it can be a program name with args. -set dummy ${ac_tool_prefix}egrep; ac_word=$2 +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking numpy module" >&5 +$as_echo_n "checking numpy module... " >&6; } + if $PYTHON -c "import numpy"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: found." >&5 +$as_echo "found." >&6; } +else + as_fn_error $? "cannot find numpy." "$LINENO" 5 +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking mpi4py module" >&5 +$as_echo_n "checking mpi4py module... " >&6; } + if $PYTHON -c "import mpi4py"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: found." >&5 +$as_echo "found." >&6; } +else + as_fn_error $? "cannot find mpi4py." "$LINENO" 5 +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking cython module" >&5 +$as_echo_n "checking cython module... " >&6; } + if $PYTHON -c "import cython"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: found." >&5 +$as_echo "found." >&6; } +else + as_fn_error $? "cannot find cython." "$LINENO" 5 +fi + # Extract the first word of "cython", so it can be a program name with args. +set dummy cython; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_DX_EGREP+:} false; then : +if ${ac_cv_prog_cython_found+:} false; then : $as_echo_n "(cached) " >&6 else - case $DX_EGREP in - [\\/]* | ?:[\\/]*) - ac_cv_path_DX_EGREP="$DX_EGREP" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR + if test -n "$cython_found"; then + ac_cv_prog_cython_found="$cython_found" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_DX_EGREP="$as_dir/$ac_word$ac_exec_ext" + ac_cv_prog_cython_found="yes" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -22470,42 +26015,79 @@ done IFS=$as_save_IFS - ;; -esac + test -z "$ac_cv_prog_cython_found" && ac_cv_prog_cython_found="no" fi -DX_EGREP=$ac_cv_path_DX_EGREP -if test -n "$DX_EGREP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $DX_EGREP" >&5 -$as_echo "$DX_EGREP" >&6; } +fi +cython_found=$ac_cv_prog_cython_found +if test -n "$cython_found"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cython_found" >&5 +$as_echo "$cython_found" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi -fi -if test -z "$ac_cv_path_DX_EGREP"; then - ac_pt_DX_EGREP=$DX_EGREP - # Extract the first word of "egrep", so it can be a program name with args. -set dummy egrep; ac_word=$2 + if test x"$cython_found" != x"yes" ; then + as_fn_error $? "cython not found." "$LINENO" 5 + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking numpy include flags" >&5 +$as_echo_n "checking numpy include flags... " >&6; } + NUMPY_INCLUDE=-I`$PYTHON -c "import numpy; print(numpy.get_include())"` + if test "$NUMPY_INCLUDE" = "-I"; then : + as_fn_error $? "cannot get numpy include path." "$LINENO" 5 +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NUMPY_INCLUDE" >&5 +$as_echo "$NUMPY_INCLUDE" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether --enable-python-tests is specified" >&5 +$as_echo_n "checking whether --enable-python-tests is specified... " >&6; } +# Check whether --enable-python-tests was given. 
+if test "${enable_python_tests+set}" = set; then : + enableval=$enable_python_tests; + if test x"$enableval" = x"yes"; then + enable_python_tests=yes + else + enable_python_tests=no + fi + +else + enable_python_tests=no +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${enable_python_tests}" >&5 +$as_echo "${enable_python_tests}" >&6; } + if test x"$enable_python_tests" = x"yes"; then + WITH_PYTHON_TESTS_TRUE= + WITH_PYTHON_TESTS_FALSE='#' +else + WITH_PYTHON_TESTS_TRUE='#' + WITH_PYTHON_TESTS_FALSE= +fi + +if test x"${enable_python_tests}" = x"yes"; then + if test x"${enable_python}" = x"no"; then + as_fn_error $? "Python tests can only be enabled it python is enabled." "$LINENO" 5 + fi + # Extract the first word of "pytest", so it can be a program name with args. +set dummy pytest; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path_ac_pt_DX_EGREP+:} false; then : +if ${ac_cv_prog_pytest_found+:} false; then : $as_echo_n "(cached) " >&6 else - case $ac_pt_DX_EGREP in - [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_DX_EGREP="$ac_pt_DX_EGREP" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR + if test -n "$pytest_found"; then + ac_cv_prog_pytest_found="$pytest_found" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_path_ac_pt_DX_EGREP="$as_dir/$ac_word$ac_exec_ext" + ac_cv_prog_pytest_found="yes" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi @@ -22513,160 +26095,23 @@ done IFS=$as_save_IFS - ;; -esac + test -z "$ac_cv_prog_pytest_found" && ac_cv_prog_pytest_found="no" fi -ac_pt_DX_EGREP=$ac_cv_path_ac_pt_DX_EGREP -if test -n "$ac_pt_DX_EGREP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_DX_EGREP" >&5 -$as_echo "$ac_pt_DX_EGREP" >&6; } +fi +pytest_found=$ac_cv_prog_pytest_found +if test -n "$pytest_found"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pytest_found" >&5 +$as_echo "$pytest_found" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi - if test "x$ac_pt_DX_EGREP" = x; then - DX_EGREP="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - DX_EGREP=$ac_pt_DX_EGREP - fi -else - DX_EGREP="$ac_cv_path_DX_EGREP" -fi - -if test "$DX_FLAG_pdf$DX_EGREP" = 1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: egrep not found - will not generate doxygen PDF documentation" >&5 -$as_echo "$as_me: WARNING: egrep not found - will not generate doxygen PDF documentation" >&2;} - DX_FLAG_pdf=0 - -fi - - : -fi - if test "$DX_FLAG_pdf" = 1; then - DX_COND_pdf_TRUE= - DX_COND_pdf_FALSE='#' -else - DX_COND_pdf_TRUE='#' - DX_COND_pdf_FALSE= -fi - -if test "$DX_FLAG_pdf" = 1; then - - : -else - - : -fi - -# LaTeX generation for PS and/or PDF: - if test "$DX_FLAG_ps" = 1 || test "$DX_FLAG_pdf" = 1; then - DX_COND_latex_TRUE= - DX_COND_latex_FALSE='#' -else - DX_COND_latex_TRUE='#' - DX_COND_latex_FALSE= -fi - -if test "$DX_FLAG_ps" = 
1 || test "$DX_FLAG_pdf" = 1; then - DX_ENV="$DX_ENV GENERATE_LATEX='YES'" - -else - DX_ENV="$DX_ENV GENERATE_LATEX='NO'" - -fi - -# Paper size for PS and/or PDF: - -case "$DOXYGEN_PAPER_SIZE" in -#( -"") - DOXYGEN_PAPER_SIZE="" - -;; #( -a4wide|a4|letter|legal|executive) - DX_ENV="$DX_ENV PAPER_SIZE='$DOXYGEN_PAPER_SIZE'" - -;; #( -*) - as_fn_error $? "unknown DOXYGEN_PAPER_SIZE='$DOXYGEN_PAPER_SIZE'" "$LINENO" 5 -;; -esac - -#For debugging: -#echo DX_FLAG_doc=$DX_FLAG_doc -#echo DX_FLAG_dot=$DX_FLAG_dot -#echo DX_FLAG_man=$DX_FLAG_man -#echo DX_FLAG_html=$DX_FLAG_html -#echo DX_FLAG_chm=$DX_FLAG_chm -#echo DX_FLAG_chi=$DX_FLAG_chi -#echo DX_FLAG_rtf=$DX_FLAG_rtf -#echo DX_FLAG_xml=$DX_FLAG_xml -#echo DX_FLAG_pdf=$DX_FLAG_pdf -#echo DX_FLAG_ps=$DX_FLAG_ps -#echo DX_ENV=$DX_ENV - - -DESPERATELY_WANT_ASSUMED_SIZE=0 -if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then - -$as_echo "#define DESPERATELY_WANT_ASSUMED_SIZE 1" >>confdefs.h - -fi - - - - - - - - - -#AC_SUBST(OPT_FCFLAGS) -DOXYGEN_OUTPUT_DIR=docs - - -rm -rf modules/ .fortran_dependencies/ -mkdir modules - -#gl_VISIBILITY -#AH_BOTTOM([#if HAVE_VISIBILITY -#define EXPORTED __attribute__((__visibility__("default"))) -#define HIDDEN __attribute__((__visibility__("hidden"))) -#else -#define EXPORTED -#define HIDDEN -#endif]) - - -# Some part of libtool is too smart and tries to parse the output of -# gfortran -v -# and catches anything that resembles a -l library link specification. -# Unfortunately, recent versions of gfortran emit -# -l gfortran -# with a space between -l and gfortran. The empty -l is then included -# into "postdeps_FC" and causes linking errors later on. -postdeps_FC=$(echo $postdeps_FC | sed 's/-l //g') - -if test x"${enable_openmp}" = x"yes"; then - SUFFIX="_openmp" -else - SUFFIX="" + if test x"$pytest_found" != x"yes" ; then + as_fn_error $? "pytest not found." 
"$LINENO" 5 + fi fi - -PKG_CONFIG_FILE=elpa${SUFFIX}-${PACKAGE_VERSION}.pc - - -ac_config_files="$ac_config_files Makefile Doxyfile ${PKG_CONFIG_FILE}:elpa.pc.in" - - cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure @@ -22792,8 +26237,8 @@ am__EXEEXT_FALSE= fi -if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then - as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined. +if test -z "${OPTIONAL_C_ERROR_ARGUMENT_TRUE}" && test -z "${OPTIONAL_C_ERROR_ARGUMENT_FALSE}"; then + as_fn_error $? "conditional \"OPTIONAL_C_ERROR_ARGUMENT\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi if test -z "${WITH_OPENMP_TRUE}" && test -z "${WITH_OPENMP_FALSE}"; then @@ -22804,6 +26249,10 @@ as_fn_error $? "conditional \"WITH_MPI\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_SCALAPACK_TESTS_TRUE}" && test -z "${WITH_SCALAPACK_TESTS_FALSE}"; then + as_fn_error $? "conditional \"WITH_SCALAPACK_TESTS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then as_fn_error $? "conditional \"AMDEP\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -22812,62 +26261,98 @@ as_fn_error $? "conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then - as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined. +if test -z "${HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE}" && test -z "${HAVE_HETEROGENOUS_CLUSTER_SUPPORT_FALSE}"; then + as_fn_error $? "conditional \"HAVE_HETEROGENOUS_CLUSTER_SUPPORT\" was never defined. 
+Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${HAVE_64BIT_INTEGER_MATH_SUPPORT_TRUE}" && test -z "${HAVE_64BIT_INTEGER_MATH_SUPPORT_FALSE}"; then + as_fn_error $? "conditional \"HAVE_64BIT_INTEGER_MATH_SUPPORT\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${HAVE_64BIT_INTEGER_MPI_SUPPORT_TRUE}" && test -z "${HAVE_64BIT_INTEGER_MPI_SUPPORT_FALSE}"; then + as_fn_error $? "conditional \"HAVE_64BIT_INTEGER_MPI_SUPPORT\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then + as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +case $FC_MODINC in #( + *\ ) FC_MODINC=$FC_MODINC'${ac_empty}' ;; +esac +case $FC_MODOUT in #( + *\ ) FC_MODOUT=$FC_MODOUT'${ac_empty}' ;; +esac +if test -z "${HAVE_REDIRECT_TRUE}" && test -z "${HAVE_REDIRECT_FALSE}"; then + as_fn_error $? "conditional \"HAVE_REDIRECT\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${HAVE_DETAILED_TIMINGS_TRUE}" && test -z "${HAVE_DETAILED_TIMINGS_FALSE}"; then + as_fn_error $? "conditional \"HAVE_DETAILED_TIMINGS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${BAND_TO_FULL_BLOCKING_TRUE}" && test -z "${BAND_TO_FULL_BLOCKING_FALSE}"; then + as_fn_error $? "conditional \"BAND_TO_FULL_BLOCKING\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_REAL_GENERIC_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_GENERIC_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi +if test -z "${WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_GENERIC_SIMPLE_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_REDIRECT_TRUE}" && test -z "${HAVE_REDIRECT_FALSE}"; then - as_fn_error $? "conditional \"HAVE_REDIRECT\" was never defined. +if test -z "${WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_DETAILED_TIMINGS_TRUE}" && test -z "${HAVE_DETAILED_TIMINGS_FALSE}"; then - as_fn_error $? "conditional \"HAVE_DETAILED_TIMINGS\" was never defined. +if test -z "${WITH_COMPLEX_GENERIC_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_GENERIC_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_GENERIC_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_SSE_ASSEMBLY_TRUE}" && test -z "${HAVE_SSE_ASSEMBLY_FALSE}"; then - as_fn_error $? "conditional \"HAVE_SSE_ASSEMBLY\" was never defined. +if test -z "${WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_GENERIC_SIMPLE_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 fi -if test -z "${HAVE_SSE_INTRINSICS_TRUE}" && test -z "${HAVE_SSE_INTRINSICS_FALSE}"; then - as_fn_error $? "conditional \"HAVE_SSE_INTRINSICS\" was never defined. +if test -z "${WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_SPARC64_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_SPARC64_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_AVX_TRUE}" && test -z "${HAVE_AVX_FALSE}"; then - as_fn_error $? "conditional \"HAVE_AVX\" was never defined. +if test -z "${WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE}" && test -z "${WITH_REAL_SPARC64_BLOCK4_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_SPARC64_BLOCK4_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${HAVE_AVX2_TRUE}" && test -z "${HAVE_AVX2_FALSE}"; then - as_fn_error $? "conditional \"HAVE_AVX2\" was never defined. +if test -z "${WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE}" && test -z "${WITH_REAL_SPARC64_BLOCK6_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_SPARC64_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -case $FC_MODINC in #( - *\ ) FC_MODINC=$FC_MODINC'${ac_empty}' ;; -esac -case $FC_MODOUT in #( - *\ ) FC_MODOUT=$FC_MODOUT'${ac_empty}' ;; -esac -if test -z "${WITH_REAL_GENERIC_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_REAL_GENERIC_KERNEL\" was never defined. +if test -z "${WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 fi -if test -z "${WITH_COMPLEX_GENERIC_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_GENERIC_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_GENERIC_KERNEL\" was never defined. +if test -z "${WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE}" && test -z "${WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE}" && test -z "${WITH_REAL_GENERIC_SIMPLE_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_REAL_GENERIC_SIMPLE_KERNEL\" was never defined. +if test -z "${WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE}" && test -z "${WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_GENERIC_SIMPLE_KERNEL\" was never defined. +if test -z "${WITH_REAL_VSX_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_VSX_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_VSX_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE}" && test -z "${WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_REAL_SSE_ASSEMBLY_KERNEL\" was never defined. +if test -z "${WITH_REAL_VSX_BLOCK4_KERNEL_TRUE}" && test -z "${WITH_REAL_VSX_BLOCK4_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_VSX_BLOCK4_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 fi -if test -z "${WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_SSE_ASSEMBLY_KERNEL\" was never defined. +if test -z "${WITH_REAL_VSX_BLOCK6_KERNEL_TRUE}" && test -z "${WITH_REAL_VSX_BLOCK6_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_VSX_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi if test -z "${WITH_REAL_SSE_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_SSE_BLOCK2_KERNEL_FALSE}"; then @@ -22882,6 +26367,22 @@ as_fn_error $? "conditional \"WITH_REAL_SSE_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_SSE_BLOCK1_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_SSE_BLOCK2_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE}" && test -z "${WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_SSE_ASSEMBLY_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_SSE_ASSEMBLY_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${WITH_REAL_AVX_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_AVX_BLOCK2_KERNEL_FALSE}"; then as_fn_error $? 
"conditional \"WITH_REAL_AVX_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -22894,6 +26395,14 @@ as_fn_error $? "conditional \"WITH_REAL_AVX_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX_BLOCK1_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX_BLOCK2_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_AVX2_BLOCK2_KERNEL_FALSE}"; then as_fn_error $? "conditional \"WITH_REAL_AVX2_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -22906,38 +26415,54 @@ as_fn_error $? "conditional \"WITH_REAL_AVX2_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_BLOCK1_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_SSE_BLOCK1_KERNEL\" was never defined. +if test -z "${WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX2_BLOCK1_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_SSE_BLOCK2_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_SSE_BLOCK2_KERNEL\" was never defined. 
+if test -z "${WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX2_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_AVX_BLOCK1_KERNEL\" was never defined. +if test -z "${WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_REAL_AVX512_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_AVX512_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_AVX_BLOCK2_KERNEL\" was never defined. +if test -z "${WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE}" && test -z "${WITH_REAL_AVX512_BLOCK4_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_AVX512_BLOCK4_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX2_BLOCK1_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_AVX2_BLOCK1_KERNEL\" was never defined. +if test -z "${WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE}" && test -z "${WITH_REAL_AVX512_BLOCK6_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_REAL_AVX512_BLOCK6_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi -if test -z "${WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX2_BLOCK2_KERNEL_FALSE}"; then - as_fn_error $? "conditional \"WITH_COMPLEX_AVX2_BLOCK2_KERNEL\" was never defined. 
+if test -z "${WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX512_BLOCK1_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX512_BLOCK1_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_AVX512_BLOCK2_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_AVX512_BLOCK2_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi if test -z "${WITH_REAL_BGP_KERNEL_TRUE}" && test -z "${WITH_REAL_BGP_KERNEL_FALSE}"; then as_fn_error $? "conditional \"WITH_REAL_BGP_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_COMPLEX_BGP_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_BGP_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_BGP_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${WITH_REAL_BGQ_KERNEL_TRUE}" && test -z "${WITH_REAL_BGQ_KERNEL_FALSE}"; then as_fn_error $? "conditional \"WITH_REAL_BGQ_KERNEL\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_COMPLEX_BGQ_KERNEL_TRUE}" && test -z "${WITH_COMPLEX_BGQ_KERNEL_FALSE}"; then + as_fn_error $? "conditional \"WITH_COMPLEX_BGQ_KERNEL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_GPU_VERSION_TRUE}" && test -z "${WITH_GPU_VERSION_FALSE}"; then + as_fn_error $? "conditional \"WITH_GPU_VERSION\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${DX_COND_doc_TRUE}" && test -z "${DX_COND_doc_FALSE}"; then as_fn_error $? "conditional \"DX_COND_doc\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 @@ -22982,6 +26507,54 @@ as_fn_error $? "conditional \"DX_COND_latex\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_USE_ASSUMED_SIZE_TRUE}" && test -z "${WITH_USE_ASSUMED_SIZE_FALSE}"; then + as_fn_error $? "conditional \"WITH_USE_ASSUMED_SIZE\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${USE_FORTRAN2008_TRUE}" && test -z "${USE_FORTRAN2008_FALSE}"; then + as_fn_error $? "conditional \"USE_FORTRAN2008\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${ENABLE_AUTOTUNING_TRUE}" && test -z "${ENABLE_AUTOTUNING_FALSE}"; then + as_fn_error $? "conditional \"ENABLE_AUTOTUNING\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${ENABLE_C_TESTS_TRUE}" && test -z "${ENABLE_C_TESTS_FALSE}"; then + as_fn_error $? "conditional \"ENABLE_C_TESTS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${BUILD_KCOMPUTER_TRUE}" && test -z "${BUILD_KCOMPUTER_FALSE}"; then + as_fn_error $? "conditional \"BUILD_KCOMPUTER\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${BUILD_KCOMPUTER_TRUE}" && test -z "${BUILD_KCOMPUTER_FALSE}"; then + as_fn_error $? "conditional \"BUILD_KCOMPUTER\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WANT_SINGLE_PRECISION_REAL_TRUE}" && test -z "${WANT_SINGLE_PRECISION_REAL_FALSE}"; then + as_fn_error $? "conditional \"WANT_SINGLE_PRECISION_REAL\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WANT_SINGLE_PRECISION_COMPLEX_TRUE}" && test -z "${WANT_SINGLE_PRECISION_COMPLEX_FALSE}"; then + as_fn_error $? 
"conditional \"WANT_SINGLE_PRECISION_COMPLEX\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${HAVE_SKEWSYMMETRIC_TRUE}" && test -z "${HAVE_SKEWSYMMETRIC_FALSE}"; then + as_fn_error $? "conditional \"HAVE_SKEWSYMMETRIC\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${STORE_BUILD_CONFIG_TRUE}" && test -z "${STORE_BUILD_CONFIG_FALSE}"; then + as_fn_error $? "conditional \"STORE_BUILD_CONFIG\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_PYTHON_TRUE}" && test -z "${WITH_PYTHON_FALSE}"; then + as_fn_error $? "conditional \"WITH_PYTHON\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WITH_PYTHON_TESTS_TRUE}" && test -z "${WITH_PYTHON_TESTS_FALSE}"; then + as_fn_error $? "conditional \"WITH_PYTHON_TESTS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 @@ -23379,7 +26952,7 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by elpa $as_me 2016.05.001, which was +This file was extended by elpa $as_me 2019.11.001, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -23445,7 +27018,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -elpa config.status 2016.05.001 +elpa config.status 2019.11.001 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -23564,7 +27137,7 @@ # # INIT-COMMANDS # -AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" +AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}" # The HP-UX ksh and POSIX shell print the target directory to stdout @@ -23961,6 +27534,9 @@ "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "Doxyfile") CONFIG_FILES="$CONFIG_FILES Doxyfile" ;; "${PKG_CONFIG_FILE}") CONFIG_FILES="$CONFIG_FILES ${PKG_CONFIG_FILE}:elpa.pc.in" ;; + "elpa/elpa_constants.h") CONFIG_FILES="$CONFIG_FILES elpa/elpa_constants.h" ;; + "elpa/elpa_version.h") CONFIG_FILES="$CONFIG_FILES elpa/elpa_version.h" ;; + "elpa/elpa_build_config.h") CONFIG_FILES="$CONFIG_FILES elpa/elpa_build_config.h" ;; *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac @@ -24560,29 +28136,35 @@ # Older Autoconf quotes --file arguments for eval, but not when files # are listed without --file. Let's play safe and only enable the eval # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac + # TODO: see whether this extra hack can be removed once we start + # requiring Autoconf 2.70 or later. + case $CONFIG_FILES in #( + *\'*) : + eval set x "$CONFIG_FILES" ;; #( + *) : + set x $CONFIG_FILES ;; #( + *) : + ;; +esac shift - for mf + # Used to flag and report bootstrapping failures. + am_rc=0 + for am_mf do # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. 
- # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line + am_mf=`$as_echo "$am_mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile which includes + # dependency-tracking related rules and includes. + # Grep'ing the whole file directly is not great: AIX grep has a line # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`$as_dirname -- "$mf" || -$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$mf" : 'X\(//\)[^/]' \| \ - X"$mf" : 'X\(//\)$' \| \ - X"$mf" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$mf" | + sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ + || continue + am_dirpart=`$as_dirname -- "$am_mf" || +$as_expr X"$am_mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$am_mf" : 'X\(//\)[^/]' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$am_mf" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -24600,53 +28182,48 @@ q } s/.*/./; q'` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. 
- for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`$as_dirname -- "$file" || -$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$file" : 'X\(//\)[^/]' \| \ - X"$file" : 'X\(//\)$' \| \ - X"$file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ + am_filepart=`$as_basename -- "$am_mf" || +$as_expr X/"$am_mf" : '.*/\([^/][^/]*\)/*$' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$am_mf" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q } - /^X\(\/\/\)$/{ + /^X\/\(\/\/\)$/{ s//\1/ q } - /^X\(\/\).*/{ + /^X\/\(\/\).*/{ s//\1/ q } s/.*/./; q'` - as_dir=$dirpart/$fdir; as_fn_mkdir_p - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done + { echo "$as_me:$LINENO: cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles" >&5 + (cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } || am_rc=$? done + if test $am_rc -ne 0; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Something went wrong bootstrapping makefile fragments + for automatic dependency tracking. Try re-running configure with the + '--disable-dependency-tracking' option to at least be able to build + the package (albeit without support for automatic dependency tracking). 
+See \`config.log' for more details" "$LINENO" 5; } + fi + { am_dirpart=; unset am_dirpart;} + { am_filepart=; unset am_filepart;} + { am_mf=; unset am_mf;} + { am_rc=; unset am_rc;} + rm -f conftest-deps.mk } ;; "libtool":C) @@ -25397,17 +28974,543 @@ fi -if test "${can_compile_avx}" = "no" ; then -# if test x"${want_avx}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Could not compile AVX instructions" >&5 -$as_echo "$as_me: WARNING: Could not compile AVX instructions" >&2;} -# fi -fi -if test "${can_compile_avx2}" = "no" ; then -# if test x"${want_avx2}" = x"yes" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Could not compile AVX2 instructions" >&5 -$as_echo "$as_me: WARNING: Could not compile AVX2 instructions" >&2;} -# fi +echo "" +echo "The following ELPA2 kernels will be build:" +echo "" + + + if test x"$use_real_generic" = x"yes" ; then + echo -n " real_generic" + if test "$fixed_real_kernel" = "real_generic" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_generic" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_generic_simple" = x"yes" ; then + echo -n " real_generic_simple" + if test "$fixed_real_kernel" = "real_generic_simple" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_generic_simple" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_generic_simple_block4" = x"yes" ; then + echo -n " real_generic_simple_block4" + if test "$fixed_real_kernel" = "real_generic_simple_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_generic_simple_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_generic_simple_block6" = x"yes" ; then + echo -n " real_generic_simple_block6" + if test "$fixed_real_kernel" = "real_generic_simple_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test 
"$default_real_kernel" = "real_generic_simple_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sparc64_block2" = x"yes" ; then + echo -n " real_sparc64_block2" + if test "$fixed_real_kernel" = "real_sparc64_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sparc64_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sparc64_block4" = x"yes" ; then + echo -n " real_sparc64_block4" + if test "$fixed_real_kernel" = "real_sparc64_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sparc64_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sparc64_block6" = x"yes" ; then + echo -n " real_sparc64_block6" + if test "$fixed_real_kernel" = "real_sparc64_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sparc64_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_neon_arch64_block2" = x"yes" ; then + echo -n " real_neon_arch64_block2" + if test "$fixed_real_kernel" = "real_neon_arch64_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_neon_arch64_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_neon_arch64_block4" = x"yes" ; then + echo -n " real_neon_arch64_block4" + if test "$fixed_real_kernel" = "real_neon_arch64_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_neon_arch64_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_neon_arch64_block6" = x"yes" ; then + echo -n " real_neon_arch64_block6" + if test "$fixed_real_kernel" = "real_neon_arch64_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_neon_arch64_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test 
x"$use_real_vsx_block2" = x"yes" ; then + echo -n " real_vsx_block2" + if test "$fixed_real_kernel" = "real_vsx_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_vsx_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_vsx_block4" = x"yes" ; then + echo -n " real_vsx_block4" + if test "$fixed_real_kernel" = "real_vsx_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_vsx_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_vsx_block6" = x"yes" ; then + echo -n " real_vsx_block6" + if test "$fixed_real_kernel" = "real_vsx_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_vsx_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sse_block2" = x"yes" ; then + echo -n " real_sse_block2" + if test "$fixed_real_kernel" = "real_sse_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sse_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sse_block4" = x"yes" ; then + echo -n " real_sse_block4" + if test "$fixed_real_kernel" = "real_sse_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sse_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sse_block6" = x"yes" ; then + echo -n " real_sse_block6" + if test "$fixed_real_kernel" = "real_sse_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sse_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_sse_assembly" = x"yes" ; then + echo -n " real_sse_assembly" + if test "$fixed_real_kernel" = "real_sse_assembly" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_sse_assembly" ; then + echo -n " (default)" + fi + echo "" 
+ fi + + if test x"$use_real_avx_block2" = x"yes" ; then + echo -n " real_avx_block2" + if test "$fixed_real_kernel" = "real_avx_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx_block4" = x"yes" ; then + echo -n " real_avx_block4" + if test "$fixed_real_kernel" = "real_avx_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx_block6" = x"yes" ; then + echo -n " real_avx_block6" + if test "$fixed_real_kernel" = "real_avx_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx2_block2" = x"yes" ; then + echo -n " real_avx2_block2" + if test "$fixed_real_kernel" = "real_avx2_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx2_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx2_block4" = x"yes" ; then + echo -n " real_avx2_block4" + if test "$fixed_real_kernel" = "real_avx2_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx2_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx2_block6" = x"yes" ; then + echo -n " real_avx2_block6" + if test "$fixed_real_kernel" = "real_avx2_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx2_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx512_block2" = x"yes" ; then + echo -n " real_avx512_block2" + if test "$fixed_real_kernel" = "real_avx512_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx512_block2" ; then + 
echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx512_block4" = x"yes" ; then + echo -n " real_avx512_block4" + if test "$fixed_real_kernel" = "real_avx512_block4" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx512_block4" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_avx512_block6" = x"yes" ; then + echo -n " real_avx512_block6" + if test "$fixed_real_kernel" = "real_avx512_block6" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_avx512_block6" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_bgp" = x"yes" ; then + echo -n " real_bgp" + if test "$fixed_real_kernel" = "real_bgp" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_bgp" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_real_bgq" = x"yes" ; then + echo -n " real_bgq" + if test "$fixed_real_kernel" = "real_bgq" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_real_kernel" = "real_bgq" ; then + echo -n " (default)" + fi + echo "" + fi + + + + if test x"$use_complex_generic" = x"yes" ; then + echo -n " complex_generic" + if test "$fixed_complex_kernel" = "complex_generic" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_generic" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_generic_simple" = x"yes" ; then + echo -n " complex_generic_simple" + if test "$fixed_complex_kernel" = "complex_generic_simple" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_generic_simple" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_sse_block1" = x"yes" ; then + echo -n " complex_sse_block1" + if test "$fixed_complex_kernel" = "complex_sse_block1" ; then + echo -n " (selected as fixed kernel)" + fi + if test 
"$default_complex_kernel" = "complex_sse_block1" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_sse_block2" = x"yes" ; then + echo -n " complex_sse_block2" + if test "$fixed_complex_kernel" = "complex_sse_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_sse_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_sse_assembly" = x"yes" ; then + echo -n " complex_sse_assembly" + if test "$fixed_complex_kernel" = "complex_sse_assembly" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_sse_assembly" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx_block1" = x"yes" ; then + echo -n " complex_avx_block1" + if test "$fixed_complex_kernel" = "complex_avx_block1" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx_block1" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx_block2" = x"yes" ; then + echo -n " complex_avx_block2" + if test "$fixed_complex_kernel" = "complex_avx_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx2_block1" = x"yes" ; then + echo -n " complex_avx2_block1" + if test "$fixed_complex_kernel" = "complex_avx2_block1" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx2_block1" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx2_block2" = x"yes" ; then + echo -n " complex_avx2_block2" + if test "$fixed_complex_kernel" = "complex_avx2_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx2_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx512_block1" = x"yes" ; 
then + echo -n " complex_avx512_block1" + if test "$fixed_complex_kernel" = "complex_avx512_block1" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx512_block1" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_avx512_block2" = x"yes" ; then + echo -n " complex_avx512_block2" + if test "$fixed_complex_kernel" = "complex_avx512_block2" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_avx512_block2" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_bgp" = x"yes" ; then + echo -n " complex_bgp" + if test "$fixed_complex_kernel" = "complex_bgp" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_bgp" ; then + echo -n " (default)" + fi + echo "" + fi + + if test x"$use_complex_bgq" = x"yes" ; then + echo -n " complex_bgq" + if test "$fixed_complex_kernel" = "complex_bgq" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_complex_kernel" = "complex_bgq" ; then + echo -n " (default)" + fi + echo "" + fi + + + +if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" -a x"${need_sse}" = x"no"; then + echo " " + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You did not request SSE support (--enable-sse), but your local CPU supports it." >&5 +$as_echo "$as_me: WARNING: You did not request SSE support (--enable-sse), but your local CPU supports it." >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You might want to re-configure, except you are cross-compiling" >&5 +$as_echo "$as_me: WARNING: You might want to re-configure, except you are cross-compiling" >&2;} + echo " " +fi + +if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" -a x"${need_sse_assembly}" = x"no"; then + echo " " + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You did not request SSE-ASSEMBLY support (--enable-sse-assembly), but your local CPU supports it." 
>&5 +$as_echo "$as_me: WARNING: You did not request SSE-ASSEMBLY support (--enable-sse-assembly), but your local CPU supports it." >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You might want to re-configure, except you are cross-compiling" >&5 +$as_echo "$as_me: WARNING: You might want to re-configure, except you are cross-compiling" >&2;} + echo " " +fi + +if test x"${ax_cv_have_avx_cpu_ext}" = x"yes" -a x"${need_avx}" = x"no"; then + echo " " + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You did not request AVX support (--enable-avx), but your local CPU supports it." >&5 +$as_echo "$as_me: WARNING: You did not request AVX support (--enable-avx), but your local CPU supports it." >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You might want to re-configure, except you are cross-compiling" >&5 +$as_echo "$as_me: WARNING: You might want to re-configure, except you are cross-compiling" >&2;} + echo " " +fi + +if test x"${ax_cv_have_avx2_cpu_ext}" = x"yes" -a x"${need_avx2}" = x"no"; then + echo " " + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You did not request AVX2 support (--enable-avx2), but your local CPU supports it." >&5 +$as_echo "$as_me: WARNING: You did not request AVX2 support (--enable-avx2), but your local CPU supports it." >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You might want to re-configure, except you are cross-compiling" >&5 +$as_echo "$as_me: WARNING: You might want to re-configure, except you are cross-compiling" >&2;} + echo " " +fi + +if test x"${ax_cv_have_avx512f_cpu_ext}" = x"yes" -a x"${need_avx512}" = x"no"; then + echo " " + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You did not request AVX512 support (--enable-avx512), but your local CPU supports it." >&5 +$as_echo "$as_me: WARNING: You did not request AVX512 support (--enable-avx512), but your local CPU supports it." 
>&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: You might want to re-configure, except you are cross-compiling" >&5 +$as_echo "$as_me: WARNING: You might want to re-configure, except you are cross-compiling" >&2;} + echo " " +fi + +echo " " +echo "***********************************************************************" +echo "* As announced, with this release candidate ELPA 2019.11.001, *" +echo "* the legacy API has been finally removed ! *" +echo "***********************************************************************" +echo " " +#echo " " +#echo "***********************************************************************" +#echo "* This is a the first release candidate of ELPA 2019.11.001 *" +#echo "* There might be still some changes until the final release of *" +#echo "* ELPA 2019.11.001 *" +#echo "***********************************************************************" +#echo " " + +if test x"$enable_kcomputer" = x"yes" ; then + echo " " + echo "Important message:" + echo "On K-computer (at the moment) the automatic creation of the generated" + echo "headers does not work." + echo "call: make -f ../generated_headers.am generated-headers top_srcdir=.." + echo "BEFORE triggering the build with make!" 
+else + if test x"$optional_c_error_argument" = x"yes" ; then + echo "#define OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h + else + echo "#undef OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h + fi + if test x"$store_build_config" = x"yes"; then + cat config.log > elpa_build_object + xxd -i elpa_build_object >> elpa/elpa_build_config.h + fi + + make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP" fi -make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" diff -Nru elpa-2016.05.001/configure.ac elpa-2019.11.001/configure.ac --- elpa-2016.05.001/configure.ac 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/configure.ac 2019-12-20 05:57:47.000000000 +0000 @@ -1,26 +1,19 @@ AC_PREREQ([2.69]) -# Remember to change the version also in elpa.spec -AC_INIT([elpa],[2016.05.001], [elpa-library@mpcdf.mpg.de]) - +# The version is set in elpa.spec, to have a single point of reference +AC_INIT([elpa],m4_esyscmd_s([awk '/^ *Version:/ {print $2;}' elpa.spec]), [elpa-library@mpcdf.mpg.de]) AC_SUBST([PACKAGE_VERSION]) -AC_CONFIG_SRCDIR([src/elpa1.F90]) +AC_CONFIG_SRCDIR([src/elpa.F90]) AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) -# Without this, automake tries to be smart and rebuilt -# the autoconf generated files such as configure, aclocal.m4, etc., -# in case the timestamps of files such as configure.ac are newer -# -# This only makes trouble for end users with out-of-date autoconf versions -# that cannot produce these files -AM_MAINTAINER_MODE([disable]) - AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_HEADERS([config.h]) AM_SILENT_RULES([yes]) +# ABI version +# # Set the libtool library version, see LIBRARY_INTERFACE # # See http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html @@ -34,49 +27,110 @@ # by the current interface, as they are ABI compatible (e.g. 
only new symbols # were added by the new interface) # -AC_SUBST([ELPA_SO_VERSION], [5:0:1]) -# +AC_SUBST([ELPA_SO_VERSION], [15:0:0]) +# AC_DEFINE_SUBST(NAME, VALUE, DESCRIPTION) +# ----------------------------------------- +AC_DEFUN([AC_DEFINE_SUBST], [ +AC_DEFINE([$1], [$2], [$3]) +AC_SUBST([$1], ['$2']) +]) + +# API Version +AC_DEFINE([EARLIEST_API_VERSION], [20170403], [Earliest supported ELPA API version]) + +AC_DEFINE_SUBST(CURRENT_API_VERSION, 20191110, "Current ELPA API version") +# Autotune Version +AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, which supports autotuning]) +AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20190524], [Current ELPA autotune version]) +AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20190524, "Current ELPA autotune version") AX_CHECK_GNU_MAKE() if test x$_cv_gnu_make_command = x ; then AC_MSG_ERROR([Need GNU Make]) fi -#AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) -#if test x"${CPP_FOUND}" = xno; then -# AC_MSG_ERROR([no cpp found]) -#fi +enable_legacy=no + +AC_MSG_CHECKING(whether in C interface the error argument should be optional) +AC_ARG_ENABLE([optional-argument-in-C-API], + AS_HELP_STRING([--enable-optional-argument-in-C-API], + [do not build C API with error argument as optional, default no]), + [ + if test x"$enableval" = x"yes"; then + optional_c_error_argument=yes + else + optional_c_error_argument=no + fi + ], + [optional_c_error_argument=no]) +AC_MSG_RESULT([${optional_c_error_argument}]) +AM_CONDITIONAL([OPTIONAL_C_ERROR_ARGUMENT],[test x"$optional_c_error_argument" = x"yes"]) +if test x"${optional_c_error_argument}" = x"yes"; then + AC_DEFINE([OPTIONAL_C_ERROR_ARGUMENT], [1], [enable error argument in C-API to be optional]) +fi + # gnu-make fortran module dependencies m4_include([fdep/fortran_dependencies.m4]) FDEP_F90_GNU_MAKE_DEPS -### +dnl OpenMP m4_include([m4/ax_elpa_openmp.m4]) - AC_MSG_CHECKING(whether --enable-openmp is specified) AC_ARG_ENABLE([openmp], - AS_HELP_STRING([--enable-openmp], - [use 
OpenMP threading, default no.]), - [], - [enable_openmp=no]) + AS_HELP_STRING([--enable-openmp], + [use OpenMP threading, default no.]), + [ + if test x"$enableval" = x"yes"; then + enable_openmp=yes + else + enable_openmp=no + fi + ], + [enable_openmp=no]) AC_MSG_RESULT([${enable_openmp}]) AM_CONDITIONAL([WITH_OPENMP],[test x"$enable_openmp" = x"yes"]) if test x"${enable_openmp}" = x"yes"; then - AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) + AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) fi + dnl mpi AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi=[[yes|no]]], [compile with MPI. Default: yes])],,[with_mpi=yes]) -AM_CONDITIONAL([WITH_MPI],[test x"with_mpi" = x"yes"]) +AM_CONDITIONAL([WITH_MPI],[test x"$with_mpi" = x"yes"]) if test x"${with_mpi}" = x"yes"; then - AC_DEFINE([WITH_MPI], [1], [use MPI]) + AC_DEFINE([WITH_MPI], [1], [use MPI]) fi -# C -AC_LANG([C]) + +dnl Scalapack tests +AC_MSG_CHECKING(whether --enable-scalapack-tests is specified) +AC_ARG_ENABLE([scalapack-tests], + AS_HELP_STRING([--enable-scalapack-tests], + [build SCALAPACK test cases for performance comparison, needs MPI, default no.]), + [ + if test x"$enableval" = x"yes"; then + enable_scalapack_tests=yes + else + enable_scalapack_tests=no + fi + ], + [enable_scalapack_tests="no"]) +AC_MSG_RESULT([$enable_scalapack_tests]) +if test x"${enable_scalapack_tests}" = x"yes"; then + if test x"$with_mpi" = x"no"; then + AC_MSG_ERROR([You cannot build the SCALAPCK test cases without MPI]) + fi + AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases]) +fi +AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"]) + + +dnl C +AC_LANG_PUSH([C]) + AX_PROG_CC_MPI([test x"$with_mpi" = x"yes"],[found_mpi_c=yes],[found_mpi_c=no]) if test x"$with_mpi" = x"yes"; then if test x"$found_mpi_c" = x"no"; then @@ -92,12 +146,138 @@ CFLAGS="$OPENMP_CFLAGS $CFLAGS" fi +c11_standard=no +AX_CHECK_COMPILE_FLAG([-std=gnu11], [ + c11_standard=yes +], [ + echo "C compiler 
cannot compile -std=gnu11 code" + echo "testing -std=c11.." +]) +if test x"$c11_standard" = x"yes"; then + CFLAGS+=" -std=gnu11" +fi + +if test x"$c11_standard" = x"no"; then + AX_CHECK_COMPILE_FLAG([-std=c11], [ + c11_standard=yes + ], [ + echo "C compiler cannot compile C11 code" + exit -1 + ]) + if test x"$c11_standard" = x"yes"; then + CFLAGS+=" -std=c11" + fi +fi + +AX_EXT + +dnl heterogenous-cluster-support +AC_MSG_CHECKING(whether heterogenous-cluster-support should be enabled) +AC_ARG_ENABLE([heterogenous-cluster-support], + AS_HELP_STRING([--heterogenous-cluster-support], + [allow ELPA to automatically switch to a kernel supported on all CPUs of a cluster (only works for Intel CPUs at the moment), default no. Activate only if necessary has a performance penalty! This feature is exerpimental!]), + [ + if test x"$enableval" = x"yes"; then + enable_heterogenous_cluster_support=yes + else + enable_heterogenous_cluster_support=no + fi + ], + [enable_heterogenous_cluster_support="no"]) +AC_MSG_RESULT([$enable_heterogenous_cluster_support]) +if test x"${enable_heterogenous_cluster_support}" = x"yes"; then + AC_DEFINE([HAVE_HETEROGENOUS_CLUSTER_SUPPORT], [1], [automatically support clusters with different Intel CPUs]) +fi +AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"]) + +dnl 64bit integer support for BLACS/LAPACK/SCALAPACK support +dnl first long int +AC_CHECK_SIZEOF([long int]) +size_of_long_int="${ac_cv_sizeof_long_int}" + +dnl then 64bit blas +AC_MSG_CHECKING(whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK)) +AC_ARG_ENABLE([64bit-integer-math-support], + AS_HELP_STRING([--64bit-integer-math-support], + [allows to link against the 64bit integer versions of the math libraries BLAS, LAPACK, and SCALAPACK]), + [ + if test x"$enableval" = x"yes"; then + enable_64bit_integer_math_support=yes + else + enable_64bit_integer_math_support=no + fi + ], + 
[enable_64bit_integer_math_support="no"]) +AC_MSG_RESULT([$enable_64bit_integer_math_support]) +if test x"${enable_64bit_integer_math_support}" = x"yes"; then + dnl at least INTEL MPI does _NOT_ support 64BIT integer mode for C thus disable C tests in this Case + if test x"${enable_c_tests}" = x"yes"; then + AC_MSG_ERROR([You cannot both define 64bit integer support and C tests. Reconfigure!]) + fi + dnl check whether long int is the correct data-type in C + if test x"${size_of_long_int}" = x"8"; then + echo "Found C data-type \"long int\" with 8 bytes" + else + AC_MSG_ERROR([The C data-type "long int" is only ${size_of_long_int} bytes; Needed is 8 bytes]) + fi + + AC_DEFINE([HAVE_64BIT_INTEGER_MATH_SUPPORT], [1], [allow to link against the 64bit integer versions of math libraries]) +fi +AM_CONDITIONAL([HAVE_64BIT_INTEGER_MATH_SUPPORT],[test x"$enable_64bit_integer_math_support" = x"yes"]) + +dnl then 64bit blas +AC_MSG_CHECKING(whether 64bit integers should be used for the MPI library) +AC_ARG_ENABLE([64bit-integer-mpi-support], + AS_HELP_STRING([--64bit-integer-mpi-support], + [allows to link against the 64bit integer versions of the MPI library]), + [ + if test x"$enableval" = x"yes"; then + enable_64bit_integer_mpi_support=yes + else + enable_64bit_integer_mpi_support=no + fi + ], + [enable_64bit_integer_mpi_support="no"]) +AC_MSG_RESULT([$enable_64bit_integer_mpi_support]) +if test x"${enable_64bit_integer_mpi_support}" = x"yes"; then + AC_DEFINE([HAVE_64BIT_INTEGER_MPI_SUPPORT], [1], [allow to link against the 64bit integer versions of the MPI library]) +fi +AM_CONDITIONAL([HAVE_64BIT_INTEGER_MPI_SUPPORT],[test x"$enable_64bit_integer_mpi_support" = x"yes"]) + + + +AC_MSG_CHECKING(whether C compiler can use _Generic ) +AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +int main(int argc, char **argv) { +#define elpa_set(e, name, value, error) _Generic((value), \ + int: \ + elpa_set_integer, \ + \ + double: \ + elpa_set_double \ + )(e, name, value, error) + + return 0; +} + 
])], + [can_compile_generic=yes], + [can_compile_generic=no] + ) +AC_MSG_RESULT([${can_compile_generic}]) +if test x"$can_compile_generic" != x"yes"; then + AC_MSG_ERROR([C compiler cannot handle _Generic statement! Upgrade or change C compiler]) +fi +AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU]) + AC_PROG_INSTALL AM_PROG_AR AM_PROG_AS +AC_PROG_CC_C99 +AM_PROG_CC_C_O +AC_LANG_POP([C]) # Fortran -AC_LANG([Fortran]) +AC_LANG_PUSH([Fortran]) m4_include([m4/ax_prog_fc_mpi.m4]) AX_PROG_FC_MPI([test x"$with_mpi" = x"yes"],[found_mpi_f=yes],[found_mpi_f=no]) if test x"$with_mpi" = x"yes"; then @@ -105,6 +285,14 @@ AC_MSG_ERROR([Could not compile an MPI Fortran program]) fi fi + +AC_FC_SRCEXT([F90]) +AC_FC_FREEFORM +AC_FC_MODULE_FLAG +AC_FC_MODULE_OUTPUT_FLAG +AC_FC_LIBRARY_LDFLAGS + + if test x"${enable_openmp}" = x"yes"; then AX_ELPA_OPENMP if test "$ac_cv_prog_fc_openmp" = unsupported; then @@ -113,281 +301,133 @@ FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS" fi -## C++ -#AC_LANG([C++]) -#AC_PROG_CXX -# -#if test x"${enable_openmp}" = x"yes"; then -# AX_ELPA_OPENMP -# if test "$ac_cv_prog_cxx_openmp" = unsupported; then -# AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS]) -# fi -# CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS" -#fi - - - -dnl variables needed for the tests - -dnl do NOT remove any variables here, until -dnl 1. you know 100% what you are doing -dnl 2. 
you tested ALL configure functionality afterwards -dnl Otherwise, you most likely break some functionality - -dnl as default always define the generic kernels to be build -dnl this is only unset if gpu_support_only is defined, or -dnl other specific real/complex kernels are wanted - -install_real_generic=yes -install_real_generic_simple=yes - -install_complex_generic=yes -install_complex_generic_simple=yes - -#want_avx=yes -#want_avx2=yes -#want_sse=yes - -AC_LANG([C]) - -dnl build with ftimings support -AC_MSG_CHECKING(whether ELPA should be build with ftimings support) -AC_ARG_WITH([ftimings], - AS_HELP_STRING([--with-ftimings], - [detailed timings, default no.]), - [with_ftimings=yes], - [with_ftimings=no]) -AC_MSG_RESULT([${with_ftimings}]) +dnl check which MPI binray invokes a MPI job +if test x"$with_mpi" = x"yes"; then + AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun aprun], [no]) + if test x"$MPI_BINARY" = x"no"; then + AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun, aprun]) + fi +fi dnl build with the possibilty to redirect stdout and stderr dnl per MPI task in a file AC_MSG_CHECKING(whether stdout/stderr file redirect should be enabled) -AC_ARG_WITH([redirect], - AS_HELP_STRING([--with-redirect], - [for test programs, allow redirection of stdout/stderr per MPI taks in a file (useful for ftimings), default no.]), - [with_redirect=yes], - [with_redirect=no]) -AC_MSG_RESULT([${with_redirect}]) +AC_ARG_ENABLE([redirect], + [AS_HELP_STRING([--enable-redirect], + [for test programs, allow redirection of stdout/stderr per MPI taks in a file (useful for timing), default no.])], + [ + if test x"$enableval" = x"yes"; then + enable_redirect=yes + else + enable_redirect=no + fi + ], + [enable_redirect=no]) +AC_MSG_RESULT([${enable_redirect}]) + +dnl check whether single precision is requested +AC_MSG_CHECKING(whether ELPA library should contain also single precision functions) 
+AC_ARG_ENABLE(single-precision, + [AS_HELP_STRING([--enable-single-precision], + [build with single precision])], + [want_single_precision="$enableval"], + [want_single_precision="no"]) +AC_MSG_RESULT([${want_single_precision}]) + -if test x"${with_redirect}" = x"yes"; then +dnl redirect +if test x"${enable_redirect}" = x"yes"; then AC_DEFINE([HAVE_REDIRECT], [1], [Redirect stdout and stderr of test programs per MPI tasks to a file]) fi -AM_CONDITIONAL([HAVE_REDIRECT],[test x"$with_redirect" = x"yes"]) +AM_CONDITIONAL([HAVE_REDIRECT],[test x"$enable_redirect" = x"yes"]) -if test x"${with_ftimings}" = x"yes"; then - AC_DEFINE([HAVE_DETAILED_TIMINGS], [1], [Enable more timings]) - AC_ARG_ENABLE([papi], - [AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count, autodetected by default])], - [want_papi=$enableval],[want_papi="auto"]) - papi_found=unknown - if test x"$want_papi" != x"no" ; then - AC_CHECK_LIB([papi],[PAPI_library_init],[papi_found="yes"],[papi_found="no"]) - if test x"$want_papi" = x"yes" ; then - if test x"$papi_found" = x"no" ; then - AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"]) - fi +dnl build with ftimings support + +AC_ARG_ENABLE([timings], + [AS_HELP_STRING([--disable-timings], + [more detailed timing, default yes])], + [ + if test x"$enableval" = x"yes"; then + enable_timings=yes + else + enable_timings=no + fi + ], + [enable_timings=yes]) + +if test x"${enable_timings}" = x"yes"; then + AC_DEFINE([HAVE_DETAILED_TIMINGS], [1], [Enable more timing]) +fi +AM_CONDITIONAL([HAVE_DETAILED_TIMINGS], [test x"$enable_timings" = x"yes"]) + +dnl PAPI for ftimings +AC_LANG_PUSH([C]) +AC_ARG_WITH([papi], + [AS_HELP_STRING([--with-papi], + [Use PAPI to also measure flop count in the detailed timing (--enable-timing), disabled by default])], + [ + if test x"$enableval" = x"yes"; then + with_papi=yes + else + with_papi=no + fi + ], + [with_papi="no"]) +if test x"${enable_timings}" = x"yes"; then + 
if test x"$with_papi" = x"yes" ; then + AC_SEARCH_LIBS([PAPI_library_init], [papi], [papi_found=yes], [papi_found=no]) + if test x"$papi_found" = x"no" ; then + AC_MSG_ERROR(["Could not find usable PAPI installation, please install or adjust CFLAGS, LDFLAGS"]) fi - fi - if test x"$papi_found" = x"yes"; then AC_DEFINE([HAVE_LIBPAPI], [1], [Use the PAPI library]) - LIBS="-lpapi $LIBS" fi fi -AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"]) - -AC_MSG_CHECKING(whether SSE assembly kernel can be compiled) - -$CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null -if test "$?" == 0; then - can_compile_sse_assembly=yes - install_real_sse_assembly=yes - install_complex_sse_assembly=yes -else - can_compile_sse_assembly=no - install_real_sse_assembly=no - install_complex_sse_assembly=no -fi +AC_LANG_POP([C]) -rm -f ./test.o -AC_MSG_RESULT([${can_compile_sse_assembly}]) - -dnl check whether on can compile with sse-gcc intrinsics -AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include - int main(int argc, char **argv){ - double* q; - __m128d h1 = _mm_loaddup_pd(q); - return 0; - } - ])], - [can_compile_sse_intrinsics=yes], - [can_compile_sse_intrinsics=no] -) -AC_MSG_RESULT([${can_compile_sse_intrinsics}]) - -if test "${can_compile_sse_intrinsics}" = "yes"; then - install_real_sse_intrinsics=yes - install_real_sse_block2=yes - install_real_sse_block4=yes - install_real_sse_block6=yes - - install_complex_sse_intrinsics=yes - install_complex_sse_block1=yes - install_complex_sse_block2=yes -else - install_real_sse_intrinsics=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - - install_complex_sse_intrinsics=no - install_complex_sse_block1=no - install_complex_sse_block2=no -fi - -dnl check whether one can compile with avx - gcc intrinsics - -dnl first pass: try with specified CFLAGS and CXXFLAGS -AC_MSG_CHECKING([whether we can 
compile AVX intrinsics in C]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include - int main(int argc, char **argv){ - double* q; - __m256d a1_1 = _mm256_load_pd(q); - return 0; - } - ])], - [can_compile_avx=yes], - [can_compile_avx=no] -) -AC_MSG_RESULT([${can_compile_avx}]) - -#if test "${can_compile_avx}" = "yes" ; then -# AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++]) -# AC_LANG_PUSH([C++]) -# AC_COMPILE_IFELSE([AC_LANG_SOURCE([ -# #include -# int main(int argc, char **argv){ -# double* q; -# __m256d a1_1 = _mm256_load_pd(q); -# return 0; -# } -# ])], -# [can_compile_avx=yes], -# [can_compile_avx=no] -# ) -# AC_LANG_POP([C++]) -# AC_MSG_RESULT([${can_compile_avx}]) -# if test "${can_compile_avx}" = "no" ; then -# AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether]) -# fi -#fi - -AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([ - #include - int main(int argc, char **argv){ - double* q; - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); - return 0; - } - ])], - [can_compile_avx2=yes], - [can_compile_avx2=no] -) -AC_MSG_RESULT([${can_compile_avx2}]) -#if test "${can_compile_avx2}" = "yes" ; then -# AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++]) -# AC_LANG_PUSH([C++]) -# AC_COMPILE_IFELSE([AC_LANG_SOURCE([ -# #include -# int main(int argc, char **argv){ -# double* q; -# __m256d q1 = _mm256_load_pd(q); -# __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); -# return 0; -# } -# ])], -# [can_compile_avx2=yes], -# [can_compile_avx2=no] -# ) -# AC_LANG_POP([C++]) -# AC_MSG_RESULT([${can_compile_avx2}]) -# if test "${can_compile_avx2}" = "no" ; then -# AC_MSG_WARN([Cannot compile C++ with AVX2!]) -# fi -#fi - -if test "${can_compile_avx}" = "yes" ; then - install_real_avx_block2=yes - install_real_avx_block4=yes - install_real_avx_block6=yes - - install_complex_avx_block1=yes - install_complex_avx_block2=yes -else - install_real_avx_block2=no - 
install_real_avx_block4=no - install_real_avx_block6=no - - install_complex_avx_block1=no - install_complex_avx_block2=no -fi - -if test "${can_compile_avx2}" = "yes" ; then - install_real_avx2_block2=yes - install_real_avx2_block4=yes - install_real_avx2_block6=yes - - install_complex_avx2_block1=yes - install_complex_avx2_block2=yes -else - install_real_avx2_block2=no - install_real_avx2_block4=no - install_real_avx2_block6=no - - install_complex_avx2_block1=no - install_complex_avx2_block2=no -fi - -AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"]) -if test x"${can_compile_sse_assembly}" = x"yes" ; then - AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU]) -fi -AM_CONDITIONAL([HAVE_SSE_INTRINSICS],[test x"$can_compile_sse_intrinsics" = x"yes"]) -if test x"${can_compile_sse_intrinsics}" = x"yes" ; then - AC_DEFINE([HAVE_SSE_INTRINSICS],[1],[gcc intrinsics SSE is supported on this CPU]) -fi - -AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"]) -if test x"${can_compile_avx}" = x"yes" ; then - AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU]) -fi -AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"]) -if test x"${can_compile_avx2}" = x"yes" ; then - AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU]) -fi - -dnl set the AVX optimization flags if this option is specified -AC_MSG_CHECKING(whether AVX optimization flags should be set automatically) -AC_ARG_WITH([avx-optimization], - AS_HELP_STRING([--with-avx-optimization], - [use AVX optimization, default no.]), - [with_avx_optimization=yes], - [with_avx_optimization=no]) -AC_MSG_RESULT([${with_avx_optimization}]) -if test x"${with_avx_optimization}" = x"yes"; then - CFLAGS="$CFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" - CXXFLAGS="$CXXFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" +dnl Likwid 
+AC_LANG_PUSH([Fortran]) +AC_ARG_WITH([likwid], + [AS_HELP_STRING([--with-likwid=[[yes|no(default)|PATH]]], + [Use Likwid to measure performance in some parts of the library])], + [with_likwid="$withval"], + [with_likwid="no"]) + +if test x"$with_likwid" != x"no" ; then + if test -d $with_likwid/lib ; then + LDFLAGS="-L$with_likwid/lib $LDFLAGS" + fi + if test -d $with_likwid/lib64 ; then + LDFLAGS="-L$with_likwid/lib64 $LDFLAGS" + fi + if test -d $with_likwid/include ; then + FCFLAGS="-I$with_likwid/include $FCFLAGS" + fi + AC_SEARCH_LIBS([likwid_markerInit], [likwid], [liblikwid_found="yes"], [liblikwid_found="no"]) + if test x"$liblikwid_found" = x"no" ; then + AC_MSG_ERROR([Could not find a usable likwid library, please adjust LDFLAGS]) + fi + AC_MSG_CHECKING([whether we can use the likwid module in a Fortran program]) + AC_COMPILE_IFELSE([ + program foo + use likwid + + implicit none + + call likwid_markerInit() + call likwid_markerThreadInit() + + call likwid_markerStartRegion("foobar") + call likwid_markerStopRegion("foobar") + + call likwid_markerClose() + end + ], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + AC_MSG_ERROR([Could not compile a Fortran program using the likwid module, adjust FCFLAGS])]) + AC_DEFINE([HAVE_LIKWID], [1], [Use likwid]) fi - -AC_LANG([Fortran]) -AC_FC_FREEFORM -AC_FC_MODULE_FLAG -AC_FC_MODULE_OUTPUT_FLAG -AC_FC_LIBRARY_LDFLAGS +AC_LANG_POP([Fortran]) save_FCFLAGS=$FCFLAGS save_LDFLAGS=$LDFLAGS @@ -402,7 +442,7 @@ AC_MSG_CHECKING([whether Fortran module iso_fortran_env is available]) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ program test_error_unit - use ISO_FORTRAN_ENV, only : error_unit + use iso_fortran_env, only : error_unit implicit none write(error_unit,*) "error_unit is defined" @@ -412,11 +452,13 @@ [can_use_iso_fortran_env=no] ) AC_MSG_RESULT([${can_use_iso_fortran_env}]) +if test x"${can_use_iso_fortran_env}" = x"yes" ; then + AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) +fi + dnl check whether 
one can link with specified MKL (desired method) AC_MSG_CHECKING([whether we can compile a Fortran program using MKL]) - - AC_COMPILE_IFELSE([AC_LANG_SOURCE([ program test_mkl use mkl_service @@ -452,7 +494,7 @@ else dnl first check blas - AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no]) + AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no]) AC_MSG_CHECKING([whether we can link a program with a blas lib]) AC_MSG_RESULT([${have_blas}]) @@ -476,6 +518,7 @@ LIBS="-l${lib} ${old_LIBS}" AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation]) AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no]) + AC_MSG_RESULT([${blacs_in_scalapack}]) if test x"${blacs_in_scalapack}" = x"yes"; then break @@ -535,6 +578,7 @@ FCFLAGS=$save_FCFLAGS LDFLAGS=$save_LDFLAGS + dnl check for intrinsic fortran function of 2003 standard AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environment_variable"]) @@ -549,273 +593,763 @@ [fortran_can_check_environment=no] ) AC_MSG_RESULT([${fortran_can_check_environment}]) +if test x"${fortran_can_check_environment}" = x"yes" ; then + AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables]) +fi -dnl now check which kernels can be compiled - -dnl the checks for SSE were already done before -dnl the checks for AVX were already done before - -dnl check BGP kernel -AC_MSG_CHECKING([whether we can compile with BGP intrinsics]) - - -AC_LINK_IFELSE([AC_LANG_SOURCE([ - program test_bgp - complex*16 :: y3,q3,h2 - y3 = fxcpmadd(y3,q3,h2) - - end program -])], - [can_compile_bgp=yes], - [can_compile_bgp=no] -) -AC_MSG_RESULT([${can_compile_bgp}]) -if test x"${can_compile_bgp}" = x"yes" ; then - install_real_bgp=yes - install_complex_bgp=yes -else - install_real_bgp=no - install_complex_bgp=no +dnl check whether BAND_TO_FULL_BLOCKING is set +AC_MSG_CHECKING(whether BAND_TO_FLULL_BLOCKING is requested) 
+AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--disable-band-to-full-blocking], + [build ELPA2 with blocking in band_to_full (default: enabled)])], + [ + if test x"$enableval" = x"yes"; then + enable_band_to_full_blocking=yes + else + enable_band_to_full_blocking=no + fi + ], + [enable_band_to_full_blocking="yes"]) +AC_MSG_RESULT([${enable_band_to_full_blocking}]) + +AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$enable_band_to_full_blocking" = x"yes"]) +if test x"${enable_band_to_full_blocking}" = x"yes"; then + AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full]) fi -dnl check BGQ kernel -AC_MSG_CHECKING([whether we can compile with BGQ intrinsics]) -AC_LINK_IFELSE([AC_LANG_SOURCE([ - program test_bgq - VECTOR(REAL(8))::QPX_h2 - real*8 :: hh(10,2) - QPX_h2 = VEC_SPLATS(hh(2,2)) +AC_ARG_WITH([cuda-path],[AS_HELP_STRING([--with-cuda-path=PATH],[prefix where CUDA is installed @<:@default=auto@:>@])], + [CUDA_INSTALL_PATH=$withval], [with_cuda=auto]) + +AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix where CUDA SDK is installed @<:@default=auto@:>@])], + [CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto]) + + +dnl setup nvcc flags and use them in later tests +user_sets_gpu_compute_capability="no" +AC_MSG_CHECKING(whether a GPU compute capability is specified) +AC_ARG_WITH([GPU-compute-capability], + [AS_HELP_STRING([--with-GPU-compute-capability=VALUE], + [use compute capability VALUE for GPU version, default: "sm_35"])], + [user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"]) +AC_MSG_RESULT([${user_sets_gpu_compute_capability}]) + + +dnl sanity check whether compute capability setting by user is reasonable +if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then + dnl the user must set a value which starts with "sm_" + value=$(echo $withval | cut -c1-3) + if test x"${value}" = x"sm_" ; then + cuda_compute_capability=$withval + else + AC_MSG_ERROR([Unknown GPU compute 
capability set: ${withval}]) + fi +fi - end program -])], - [can_compile_bgq=yes], - [can_compile_bgq=no] -) -AC_MSG_RESULT([${can_compile_bgq}]) -if test x"${can_compile_bgq}" = x"yes" ; then - install_real_bgq=yes - install_complex_bgq=yes -else - install_real_bgq=no - install_complex_bgq=no +dnl Test possibility of 'use mpi', if requested +if test x"${with_mpi}" = x"yes" ; then + AC_ARG_ENABLE([mpi-module], + AS_HELP_STRING([--disable-mpi-module], + [do not use the Fortran MPI module, get interfaces by 'include "mpif.h']), + [ + if test x"$enableval" = x"yes"; then + enable_mpi_module=yes + else + enable_mpi_module=no + fi + ], + [enable_mpi_module=yes]) + if test x"${enable_mpi_module}" = x"yes" ; then + AC_MSG_CHECKING(whether Fortran mpi module can be used) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + program test_mpi_module + use mpi + real :: time + time = MPI_WTime() + end program + ])], + [can_use_fortran_mpi_module=yes], + [can_use_fortran_mpi_module=no] + ) + AC_MSG_RESULT([${can_use_fortran_mpi_module}]) + if test x"${can_use_fortran_mpi_module}" = x"yes" ; then + AC_DEFINE([HAVE_MPI_MODULE],[1],[can use the Fortran mpi module]) + else + AC_MSG_ERROR([Could not compile a Fortran program with an 'use mpi' statement. 
You can try again with --disable-mpi-module]) + fi + fi fi +AC_LANG_POP([Fortran]) +dnl Assemble the list of kernels to build +m4_pattern_forbid([elpa_m4]) -dnl environment variable setting of kernel -if test x"${fortran_can_check_environment}" = x"yes" ; then - AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables]) -fi +m4_define(elpa_m4_generic_kernels, [ + real_generic + real_generic_simple + real_generic_simple_block4 + real_generic_simple_block6 + complex_generic + complex_generic_simple +]) -dnl last check whether user wants to compile only a specific kernel -dnl +m4_define(elpa_m4_sse_assembly_kernels, [ + real_sse_assembly + complex_sse_assembly +]) -m4_include([m4/ax_elpa_specific_kernels.m4]) +m4_define(elpa_m4_sse_kernels, [ + real_sse_block2 + real_sse_block4 + real_sse_block6 + complex_sse_block1 + complex_sse_block2 +]) -dnl real kernels - dnl do not remove this variable it is needed in the macros - use_specific_real_kernel=no +m4_define(elpa_m4_sparc64_kernels, [ + real_sparc64_block2 + real_sparc64_block4 + real_sparc64_block6 +]) - dnl generic kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic]) +m4_define(elpa_m4_neon_arch64_kernels, [ + real_neon_arch64_block2 + real_neon_arch64_block4 + real_neon_arch64_block6 +]) - dnl generic-simple kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple]) +m4_define(elpa_m4_vsx_kernels, [ + real_vsx_block2 + real_vsx_block4 + real_vsx_block6 +]) - dnl sse kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-assembly-kernel-only],[sse-assembly-kernel],[install_real_sse_assembly]) +m4_define(elpa_m4_avx_kernels, [ + real_avx_block2 + real_avx_block4 + real_avx_block6 + complex_avx_block1 + complex_avx_block2 +]) - dnl bgp kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp]) 
+m4_define(elpa_m4_avx2_kernels, [ + real_avx2_block2 + real_avx2_block4 + real_avx2_block6 + complex_avx2_block1 + complex_avx2_block2 +]) - dnl bgq kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) +m4_define(elpa_m4_avx512_kernels, [ + real_avx512_block2 + real_avx512_block4 + real_avx512_block6 + complex_avx512_block1 + complex_avx512_block2 +]) - dnl real-sse-block2 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2]) +m4_define(elpa_m4_bgp_kernels, [ + real_bgp + complex_bgp +]) - dnl real-sse-block4 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel]-only,[real-sse-block4-kernel],[install_real_sse_block4]) +m4_define(elpa_m4_bgq_kernels, [ + real_bgq + complex_bgq +]) - dnl real-sse-block6 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6]) +#m4_define(elpa_m4_gpu_kernels, [ +# real_gpu +# complex_gpu +#]) + +m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq]) + +m4_define(elpa_m4_all_kernels, + m4_foreach_w([elpa_m4_type], + elpa_m4_kernel_types, + [m4_foreach_w([elpa_m4_kernel],[elpa_m4_]elpa_m4_type[_kernels],elpa_m4_kernel )])) + +m4_define(elpa_m4_real_kernels, + m4_foreach_w(elpa_m4_kernel, + elpa_m4_all_kernels, + [m4_bmatch(elpa_m4_kernel,real,elpa_m4_kernel) ])) + +m4_define(elpa_m4_complex_kernels, + m4_foreach_w(elpa_m4_kernel, + elpa_m4_all_kernels, + [m4_bmatch(elpa_m4_kernel,complex,elpa_m4_kernel) ])) - dnl real-avx-block2 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) +dnl +dnl ELPA_SELECT_KERNELS([flagname],[default]) +dnl +dnl default should be one of `enable' or `disable' +dnl +AC_DEFUN([ELPA_SELECT_KERNELS], [ + 
AC_ARG_ENABLE(m4_bpatsubst($1,[_],[-]),[AS_HELP_STRING([--]m4_case([$2],[enable],[disable],[disable],[enable])[-]m4_bpatsubst($1,[_],[-]), + m4_case([$2],[enable],[do not build],[disable],[build])[ ]m4_toupper($1)[ kernels, default: $2d])], + [],[enable_$1=]m4_case([$2],[enable],[yes],[disable],[no])) + m4_foreach_w(elpa_m4_kernel,elpa_m4_$1_kernels,[ + use_[]elpa_m4_kernel[=$enable_$1] + ]) +]) - dnl real-avx-block4 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4]) +dnl Modify list of kernels with configure arguments +ELPA_SELECT_KERNELS([generic],[enable]) +ELPA_SELECT_KERNELS([sparc64],[disable]) +ELPA_SELECT_KERNELS([neon_arch64],[disable]) +ELPA_SELECT_KERNELS([vsx],[disable]) +ELPA_SELECT_KERNELS([sse],[enable]) +ELPA_SELECT_KERNELS([sse_assembly],[enable]) +ELPA_SELECT_KERNELS([avx],[enable]) +ELPA_SELECT_KERNELS([avx2],[enable]) +ELPA_SELECT_KERNELS([avx512],[enable]) +#ELPA_SELECT_KERNELS([gpu],[disable]) +ELPA_SELECT_KERNELS([bgp],[disable]) +ELPA_SELECT_KERNELS([bgq],[disable]) - dnl real-avx-block6 kernel - DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6]) +m4_foreach_w([elpa_m4_kind],[real complex],[ + default_[]elpa_m4_kind[]_kernel="" +]) +if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then + m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ + if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then + echo "Disabling elpa_m4_kernel due to BGP/BGQ option" + fi + use_[]elpa_m4_kernel[]=no + ]) +fi -dnl complex kernels - dnl do not remove this variable it is needed in the macros - use_specific_complex_kernel=no +dnl Deal with --with-fixed-[real|complex]-kernel arguments +m4_foreach_w([elpa_m4_kind],[real complex],[ + AC_ARG_WITH([fixed-]elpa_m4_kind[-kernel], 
m4_expand([AS_HELP_STRING([--with-fixed-]elpa_m4_kind[-kernel]=KERNEL, + [compile with only a single specific ]elpa_m4_kind[ kernel. Available kernels are:] + m4_foreach_w([elpa_m4_kernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[m4_bpatsubst(elpa_m4_kernel,elpa_m4_kind[]_,[]) ]))]), + [fixed_]elpa_m4_kind[_kernel="]elpa_m4_kind[_$withval"],[fixed_]elpa_m4_kind[_kernel=""]) + if test -n "$fixed_[]elpa_m4_kind[]_kernel" ; then + found="no" + m4_foreach_w([elpa_m4_otherkernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[ + if test "$fixed_]elpa_m4_kind[_kernel" = "]elpa_m4_otherkernel[" ; then + use_[]elpa_m4_otherkernel[]=yes + found="yes" + else + use_[]elpa_m4_otherkernel[]=no + fi + ]) + if test x"$found" = x"no" ; then + AC_MSG_ERROR([Invalid kernel "$fixed_]elpa_m4_kind[_kernel" specified for --with-fixed-]elpa_m4_kind[-kernel]) + fi + default_[]elpa_m4_kind[]_kernel="$fixed_[]elpa_m4_kind[]_kernel" + AC_DEFINE([WITH_FIXED_]m4_toupper(elpa_m4_kind)[_KERNEL],[1],[use only one specific ]elpa_m4_kind[ kernel (set at compile time)]) + fi +]) - dnl generic kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic]) +#AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only], +# [Compile and always use the GPU version])], +# [],[with_gpu_support_only=no]) +#if test x"$with_gpu_support_only" = x"yes" ; then +# m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ +# use_[]elpa_m4_kernel[]=no +# ]) +# use_real_gpu=yes +# use_complex_gpu=yes +#fi - dnl generic-simple kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple]) - dnl sse kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-assembly-kernel-only],[sse-assembly-kernel],[install_complex_sse_assembly]) +dnl +dnl ELPA_KERNEL_DEPENDS([kernel],[other kernels]) +dnl +dnl Switch on each of the other kernels if the given kernel is selected +dnl 
+AC_DEFUN([ELPA_KERNEL_DEPENDS],[ + if test x"$use_$1" = x"yes"; then + m4_foreach_w([elpa_m4_requiredkernel],[$2],[ + if test x"$use_[]elpa_m4_requiredkernel[]" = x"no" ; then + echo "Enabling elpa_m4_requiredkernel kernel, is a prerequisite for $1" + fi + use_[]elpa_m4_requiredkernel[]=yes + ]) + fi +]) +m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[ + ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2]) + ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2]) + ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1]) +]) - dnl complex-bqp kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp]) +m4_foreach_w([elpa_m4_type],elpa_m4_kernel_types,[ + need_[]elpa_m4_type=no + need_[]elpa_m4_type[]_kernels="" + m4_foreach_w([elpa_m4_kernel],m4_expand([elpa_m4_]elpa_m4_type[_kernels]),[ + if test x"$use_[]elpa_m4_kernel" = x"yes" ; then + need_[]elpa_m4_type=yes + need_[]elpa_m4_type[]_kernels="$need_[]elpa_m4_type[]_kernels elpa_m4_kernel" + fi + ]) +]) +m4_foreach_w([elpa_m4_type],elpa_m4_kernel_types,[ + if test x"$need_[]elpa_m4_type[]" = x"yes" ; then + echo "Using m4_toupper(elpa_m4_type) for kernels$need_[]elpa_m4_type[]_kernels" + fi +]) +m4_foreach_w([elpa_m4_type],elpa_m4_kernel_types,[ + if test x"$need_[]elpa_m4_type[]" != x"yes" ; then + echo "Not using m4_toupper(elpa_m4_type) as no selected kernel needs it" + fi +]) - dnl complex-bqq kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) - dnl complex-sse-block1 kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1]) +dnl the list of kernels is now assembled +dnl choosing a default kernel - dnl complex-avx-block2 kernel - 
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2]) +m4_foreach_w([elpa_m4_kind],[real complex],[ + AC_ARG_WITH([default-]elpa_m4_kind[-kernel], m4_expand([AS_HELP_STRING([--with-default-]elpa_m4_kind[-kernel]=KERNEL, + [set a specific ]elpa_m4_kind[ kernel as default kernel. Available kernels are:] + m4_foreach_w([elpa_m4_kernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[m4_bpatsubst(elpa_m4_kernel,elpa_m4_kind[]_,[]) ]))]), + [default_]elpa_m4_kind[_kernel="]elpa_m4_kind[_$withval"],[default_]elpa_m4_kind[_kernel=""]) + #if test -n "$default_[]elpa_m4_kind[]_kernel" ; then + # found="no" + # m4_foreach_w([elpa_m4_otherkernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[ + # if test "$default_]elpa_m4_kind[_kernel" = "]elpa_m4_otherkernel[" ; then + # use_[]elpa_m4_otherkernel[]=yes + # found="yes" + # else + # use_[]elpa_m4_otherkernel[]=no + # fi + # ]) + # if test x"$found" = x"no" ; then + # AC_MSG_ERROR([Invalid kernel "$default_]elpa_m4_kind[_kernel" specified for --with-default-]elpa_m4_kind[-kernel]) + # fi + # AC_DEFINE([WITH_DEFAULT_]m4_toupper(elpa_m4_kind)[_KERNEL],[1],[use specific ]elpa_m4_kind[ default kernel (set at compile time)]) + #fi +]) - dnl complex-avx-block1 kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) - dnl complex-avx-block2 kernel - DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2]) -dnl set the conditionals according to the previous tests +m4_foreach_w([elpa_m4_kind],[real complex],[ + m4_foreach_w([elpa_m4_kernel], + m4_foreach_w([elpa_m4_cand_kernel], + elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels, + 
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), + [ + if test -z "$default_[]elpa_m4_kind[]_kernel"; then + if test x"$use_[]elpa_m4_kernel" = x"yes"; then + default_[]elpa_m4_kind[]_kernel="elpa_m4_kernel" + fi + fi + ]) + if test -z "$default_[]elpa_m4_kind[]_kernel"; then + AC_MSG_ERROR([Internal error, could not determine a default kernel]) + fi + # find the number of this kernel + ELPA_2STAGE_[]m4_toupper(elpa_m4_kind)[]_DEFAULT=`grep -i '^ *X(ELPA_2STAGE_'$default_[]elpa_m4_kind[]_kernel'\>' $srcdir/elpa/elpa_constants.h.in | \ + perl -pe 's/^[[^,]]*, *//; s/,.*//;'` + AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kind)[_DEFAULT]) +]) -if test x"${can_use_iso_fortran_env}" = x"yes" ; then - AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) +dnl #include +dnl #include +dnl int main(int argc, char **argv) { +dnl __m128d q; +dnl __m128d h1 = _fjsp_neg_v2r8(q); +dnl return 0; +dnl } +AC_LANG_PUSH([C]) + +if test x"${need_vsx}" = x"yes"; then + AC_MSG_CHECKING(whether we can compile Altivec VSX with intrinsics in C) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +#include +int main(int argc, char **argv) { + __vector double a, b, c; + c = vec_add(a,b); + return 0; +} + ])], + [can_compile_vsx=yes], + [can_compile_vsx=no] + ) + AC_MSG_RESULT([${can_compile_vsx}]) + if test x"$can_compile_vsx" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-vsx, or adjust the C compiler or CFLAGS]) + fi + AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU]) fi -AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"]) -if test x"${install_real_generic}" = x"yes" ; then - AC_DEFINE([WITH_REAL_GENERIC_KERNEL],[1],[can use real generic kernel]) +if test x"${need_sparc64}" = x"yes"; then + AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +#include +#include +int main(int argc, char **argv) { + __m128d tau1; + 
__m128d h1 = _fjsp_neg_v2r8(tau1); + return 0; +} + ])], + [can_compile_sparc64=yes], + [can_compile_sparc64=no] + ) + AC_MSG_RESULT([${can_compile_sparc64}]) + if test x"$can_compile_sparc64" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-sparc64, or adjust the C compiler or CFLAGS]) + fi + AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU]) fi -AM_CONDITIONAL([WITH_COMPLEX_GENERIC_KERNEL],[test x"$install_complex_generic" = x"yes"]) -if test x"${install_complex_generic}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_GENERIC_KERNEL],[1],[can use complex generic kernel]) +if test x"${need_neon_arch64}" = x"yes"; then + AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +#include +int main(int argc, char **argv) { + __Float64x2_t x1, x2, x3, x4; + x4 = vfmaq_f64(x1, x2, x3); + return 0; +} + ])], + [can_compile_neon_arch64=yes], + [can_compile_neon_arch64=no] + ) + AC_MSG_RESULT([${can_compile_neon_arch64}]) + if test x"$can_compile_neon_arch64" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS]) + fi + AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU]) fi -AM_CONDITIONAL([WITH_REAL_GENERIC_SIMPLE_KERNEL],[test x"$install_real_generic_simple" = x"yes"]) -if test x"${install_real_generic_simple}" = x"yes" ; then - AC_DEFINE([WITH_REAL_GENERIC_SIMPLE_KERNEL],[1],[can use real generic-simple kernel]) -fi -AM_CONDITIONAL([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[test x"$install_complex_generic_simple" = x"yes"]) -if test x"${install_complex_generic_simple}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel]) +if test x"${need_sse}" = x"yes"; then + AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +#include +int main(int 
argc, char **argv) { + double* q; + __m128d h1 = _mm_loaddup_pd(q); + return 0; +} + ])], + [can_compile_sse=yes], + [can_compile_sse=no] + ) + AC_MSG_RESULT([${can_compile_sse}]) + if test x"$can_compile_sse" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-sse, or adjust the C compiler or CFLAGS]) + fi + AC_DEFINE([HAVE_SSE_INTRINSICS],[1],[gcc intrinsics SSE is supported on this CPU]) fi -AM_CONDITIONAL([WITH_REAL_SSE_ASSEMBLY_KERNEL],[test x"$install_real_sse_assembly" = x"yes"]) -if test x"${install_real_sse_assembly}" = x"yes" ; then - AC_DEFINE([WITH_REAL_SSE_ASSEMBLY_KERNEL],[1],[can use real SSE assembly kernel]) -fi -AM_CONDITIONAL([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[test x"$install_complex_sse_assembly" = x"yes"]) -if test x"${install_complex_sse_assembly}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[1],[can use complex SSE assembly kernel]) -fi +if test x"${need_sse_assembly}" = x"yes"; then + AC_MSG_CHECKING(whether double-precision SSE assembly kernels can be compiled) -AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"]) -if test x"${install_real_sse_block2}" = x"yes" ; then - AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel]) -fi + $CC $CFLAGS -c $srcdir/src/elpa2/kernels/asm_x86_64_double_precision.s -o conftest.o 2>&5 -AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"]) -if test x"${install_real_sse_block4}" = x"yes" ; then - AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel]) -fi + if test "$?" == 0; then + can_compile_sse_asm_double=yes + else + can_compile_sse_asm_double=no + fi + rm -f ./conftest.o + AC_MSG_RESULT([${can_compile_sse_asm_double}]) + if test x"$can_compile_sse_asm_double" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. 
Possibly (some of) the flags " $SIMD_FLAGS " solve this issue]) + fi -AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"]) -if test x"${install_real_sse_block6}" = x"yes" ; then - AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel]) -fi + if test x"${want_single_precision}" = x"yes" ; then + AC_MSG_CHECKING(whether single-precision SSE assembly kernels can be compiled) -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"]) -if test x"${install_real_avx_block2}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel]) -fi + $CC $CFLAGS -c $srcdir/src/elpa2/kernels/asm_x86_64_single_precision.s -o conftest.o 2>&5 -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK4_KERNEL],[test x"$install_real_avx_block4" = x"yes"]) -if test x"${install_real_avx_block4}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK4_KERNEL],[1],[can use real_avx_block4 kernel]) + if test "$?" == 0; then + can_compile_sse_asm_single=yes + else + can_compile_sse_asm_single=no + fi + rm -f ./conftest.o + AC_MSG_RESULT([${can_compile_sse_asm_single}]) + if test x"$can_compile_sse_asm_single" != x"yes"; then + AC_MSG_ERROR([Could not compile test program, try with --disable-sse-assembly, or adjust the C compiler or CFLAGS. 
Possibly (some of) the flags " $SIMD_FLAGS " solve this issue]) + fi + fi fi -AM_CONDITIONAL([WITH_REAL_AVX_BLOCK6_KERNEL],[test x"$install_real_avx_block6" = x"yes"]) -if test x"${install_real_avx_block6}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel]) -fi -AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"]) -if test x"${install_real_avx2_block2}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel]) -fi -AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"]) -if test x"${install_real_avx2_block4}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel]) +if test x"${need_avx}" = x"yes"; then + dnl check whether one can compile AVX gcc intrinsics + AC_MSG_CHECKING([whether we can compile AVX gcc intrinsics in C]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + #include + int main(int argc, char **argv){ + double* q; + __m256d a1_1 = _mm256_load_pd(q); + return 0; + } + ])], + [can_compile_avx=yes], + [can_compile_avx=no] + ) + AC_MSG_RESULT([${can_compile_avx}]) + if test x"$can_compile_avx" != x"yes"; then + AC_MSG_ERROR([Could not compile a test program with AVX, try with --disable-avx, or adjust the C compiler or CFLAGS. 
Possibly (some of) the flags " $SIMD_FLAGS " solve this issue]) + fi + AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU]) fi -AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"]) -if test x"${install_real_avx2_block6}" = x"yes" ; then - AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel]) -fi -AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"]) -if test x"${install_complex_sse_block1}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel]) +if test x"${need_avx2}" = x"yes"; then + AC_MSG_CHECKING([whether we can compile AVX2 gcc intrinsics in C]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + #include + int main(int argc, char **argv){ + double* q; + __m256d q1 = _mm256_load_pd(q); + __m256d y1 = _mm256_fmadd_pd(q1, q1, q1); + return 0; + } + ])], + [can_compile_avx2=yes], + [can_compile_avx2=no] + ) + AC_MSG_RESULT([${can_compile_avx2}]) + if test x"$can_compile_avx2" != x"yes"; then + AC_MSG_ERROR([Could not compile a test program with AVX2, try with --disable-avx2, or adjust the C compiler or CFLAGS. 
Possibly (some of) the flags " $SIMD_FLAGS " solve this issue]) + fi + AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU]) fi -AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"]) -if test x"${install_complex_sse_block2}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel]) -fi -AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"]) -if test x"${install_complex_avx_block1}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel]) -fi +if test x"${need_avx512}" = x"yes"; then + AC_MSG_CHECKING([whether we can compile AVX512 gcc intrinsics in C]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + #include + int main(int argc, char **argv){ + double* q; + __m512d q1 = _mm512_load_pd(q); + __m512d y1 = _mm512_fmadd_pd(q1, q1, q1); + return 0; + } + ])], + [can_compile_avx512=yes], + [can_compile_avx512=no] + ) + AC_MSG_RESULT([${can_compile_avx512}]) + if test x"$can_compile_avx512" != x"yes"; then + AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS. 
Possibly (some of) the flags " $SIMD_FLAGS " solve this issue]) + fi + AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU]) -AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[test x"$install_complex_avx_block2" = x"yes"]) -if test x"${install_complex_avx_block2}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel]) -fi + if test x"$can_compile_avx512" = x"yes"; then + AC_MSG_CHECKING([whether we compile for Xeon]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + #include + int main(int argc, char **argv){ + __m512d sign; + __m512d h1_real; + + __m512d x1 = _mm512_xor_pd(h1_real, sign); + return 0; + } + ])], + [can_compile_avx512_xeon=yes], + [can_compile_avx512_xeon=no] + ) + AC_MSG_RESULT([${can_compile_avx512_xeon}]) + + AC_MSG_CHECKING([whether we compile for Xeon PHI]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ + #include + int main(int argc, char **argv){ + __m512d sign; + __m512d h1; + __m512d h2_real; + + __m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + return 0; + } + ])], + [can_compile_avx512_xeon_phi=yes], + [can_compile_avx512_xeon_phi=no] + ) + AC_MSG_RESULT([${can_compile_avx512_xeon_phi}]) + + # this is needed for the intel compiler + if test x"$can_compile_avx512_xeon" = x"yes" ; then + if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then + # we want only one to be true; this is ugly but could not come up with a better way + grep Phi /proc/cpuinfo > /dev/null + if test x"$?" = x"0" ; then + echo "Xeon PHI found ... 
disabling AVX512 Xeon" + can_compile_avx512_xeon=no + fi + fi + fi -AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"]) -if test x"${install_complex_avx2_block1}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel]) + if test x"$can_compile_avx512_xeon" = x"yes"; then + AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU]) + else + if test x"$can_compile_avx512_xeon_phi" = x"yes"; then + AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU]) + else + AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!]) + fi + fi + fi fi +AC_LANG_POP([C]) + + +AC_LANG_PUSH([Fortran]) +if test x"${need_bgp}" = x"yes"; then + AC_MSG_CHECKING([whether we can compile with BGP intrinsics]) + AC_LINK_IFELSE([AC_LANG_SOURCE([ + program test_bgp + complex*16 :: y3,q3,h2 + y3 = fxcpmadd(y3,q3,h2) -AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"]) -if test x"${install_complex_avx2_block2}" = x"yes" ; then - AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel]) + end program + ])], + [can_compile_bgp=yes], + [can_compile_bgp=no] + ) + AC_MSG_RESULT([${can_compile_bgp}]) + if test x"$can_compile_bgp" != x"yes"; then + AC_MSG_ERROR([Could not compile a test program with BGP intrinsics, adjust the FC compiler or FCFLAGS]) + fi fi -AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"]) -if test x"${install_real_bgp}" = x"yes" ; then - AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel]) + +if test x"${need_bgq}" = x"yes"; then + AC_MSG_CHECKING([whether we can compile with BGQ intrinsics]) + AC_LINK_IFELSE([AC_LANG_SOURCE([ + program test_bgq + VECTOR(REAL(8))::QPX_h2 + real*8 :: hh(10,2) + QPX_h2 = VEC_SPLATS(hh(2,2)) + + end program + ])], + [can_compile_bgq=yes], + 
[can_compile_bgq=no] + ) + AC_MSG_RESULT([${can_compile_bgq}]) + if test x"$can_compile_bgq" != x"yes"; then + AC_MSG_ERROR([Could not compile a test program with BGQ intrinsics, adjust the FC compiler or FCFLAGS]) + fi fi +AC_LANG_POP([Fortran]) + + +AC_MSG_CHECKING(whether GPU version should be used) +AC_ARG_ENABLE([gpu], + AS_HELP_STRING([--enable-gpu], + [do use GPU version]), + [if test x"$enableval" = x"yes"; then + use_gpu=yes + else + use_gpu=no + fi], + [use_gpu=no]) +AC_MSG_RESULT([${use_gpu}]) +if test x"${use_gpu}" = x"yes" ; then + need_gpu=yes + use_real_gpu=yes + use_complex_gpu=yes +fi + +if test x"${need_gpu}" = x"yes" ; then + AC_LANG_PUSH([C]) + CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include" + LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64" + NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS" + NVCC="nvcc" + AC_SUBST(NVCC) + AC_SUBST(NVCCFLAGS) + + dnl check whether nvcc compiler is found + AC_CHECK_PROG(nvcc_found,nvcc,yes,no) + if test x"${nvcc_found}" = x"no" ; then + AC_MSG_ERROR([nvcc not found; try to set the cuda-path or disable GPU support]) + fi -AM_CONDITIONAL([WITH_REAL_BGQ_KERNEL],[test x"$install_real_bgq" = x"yes"]) -if test x"${install_real_bgq}" = x"yes" ; then - AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel]) + dnl check whether we find cublas + AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no]) + if test x"${have_cublas}" = x"no"; then + AC_MSG_ERROR([Could not link cublas; try to set the cuda-path or disable GPU support]) + fi + AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no]) + if test x"${have_cudart}" = x"no"; then + AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable GPU support]) + fi + AC_LANG_POP([C]) fi -if test x"${use_specific_complex_kernel}" = x"no" ; then - AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) -else - 
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)]) +AC_MSG_CHECKING(whether GPU memory debugging should be enabled) +AC_ARG_ENABLE([gpu-memory-debug], + AS_HELP_STRING([--enable-gpu-memory-debug], + [Output information on GPU memory to be processed by utils/memory/check_memory.py]), + [if test x"$enableval" = x"yes"; then + enable_gpu_memory_debug=yes + else + enable_gpu_memory_debug=no + fi], + [enable_gpu_memory_debug=no]) +AC_MSG_RESULT([${enable_gpu_memory_debug}]) +if test x"${enable_gpu_memory_debug}" = x"yes" ; then + AC_DEFINE([DEBUG_CUDA],[1],[enable CUDA debugging]) fi -if test x"${use_specific_real_kernel}" = x"no" ; then - AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) + +m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ + AM_CONDITIONAL([WITH_]m4_toupper(elpa_m4_kernel)[_KERNEL],[test x"$use_[]elpa_m4_kernel[]" = x"yes"]) + if test x"$use_[]elpa_m4_kernel[]" = x"yes" ; then + AC_DEFINE([WITH_]m4_toupper(elpa_m4_kernel)[_KERNEL],[1],[Build elpa_m4_kernel kernel]) + kernel_defined=1 + else + kernel_defined=0 + fi + [ELPA_2STAGE_]m4_toupper(elpa_m4_kernel)[_COMPILED]=$kernel_defined + AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kernel)[_COMPILED]) +]) + +AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"]) +if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then + AC_DEFINE([WITH_GPU_VERSION],[1],[enable GPU support]) + #AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build]) + ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 + ELPA_2STAGE_REAL_GPU_COMPILED=1 + + AC_MSG_CHECKING(whether --enable-nvtx is specified) + AC_ARG_ENABLE([nvtx], + AS_HELP_STRING([--enable-nvtx], + [build and install nvtx wrapper for profiling th GPU version, default no.]), + [ + if test x"$enableval" = x"yes"; then + enable_nvtx=yes + else + enable_nvtx=no + fi + ], + [enable_nvtx=no]) + 
AC_MSG_RESULT([${enable_nvtx}]) + if test x"${enable_nvtx}" = x"yes"; then + AC_DEFINE([WITH_NVTX],[1],[enable NVTX support]) + AC_LANG_PUSH([C]) + AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no]) + if test x"${have_nvtoolsext}" = x"no"; then + AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable GPU support ]) + fi + AC_LANG_POP([C]) + fi else - AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)]) + ELPA_2STAGE_COMPLEX_GPU_COMPILED=0 + ELPA_2STAGE_REAL_GPU_COMPILED=0 fi +AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED]) +AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED]) LT_INIT @@ -825,24 +1359,159 @@ DX_HTML_FEATURE(ON) DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs]) -DESPERATELY_WANT_ASSUMED_SIZE=0 -if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then - AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable]) +AC_MSG_CHECKING(whether assumed size Fortran arrays should be used) +AC_ARG_ENABLE([assumed-size], + AS_HELP_STRING([--disable-assumed-size], + [do NOT use assumed-size Fortran arrays]), + [if test x"$enableval" = x"yes"; then + USE_ASSUMED_SIZE=yes + else + USE_ASSUMED_SIZE=no + fi], + [USE_ASSUMED_SIZE=yes]) +AC_MSG_RESULT([${USE_ASSUMED_SIZE}]) +AM_CONDITIONAL([WITH_USE_ASSUMED_SIZE],[test x"$USE_ASSUMED_SIZE" = x"yes"]) +if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + AC_DEFINE([USE_ASSUMED_SIZE],[1],[for performance reasons use assumed size Fortran arrays, even if not debuggable]) +fi + +AC_MSG_CHECKING(whether Fortran2008 features should be enabled) +AC_ARG_ENABLE([Fortran2008-features], + AS_HELP_STRING([--enable-Fortran2008-features], + [enables some Fortran 2008 features, default yes.]), + [ + if test x"$enableval" = x"yes"; then + enable_fortran2008_features=yes + else + enable_fortran2008_features=no + fi + ], + [enable_fortran2008_features=yes]) +AC_MSG_RESULT([${enable_fortran2008_features}]) 
+AM_CONDITIONAL([USE_FORTRAN2008],[test x"$enable_fortran2008_features" = x"yes"]) +if test x"${enable_fortran2008_features}" = x"yes"; then + AC_DEFINE([USE_FORTRAN2008], [1], [use some Fortran 2008 features]) +fi + +AC_MSG_CHECKING(whether autotuning functionality should be enabled) +AC_ARG_ENABLE([autotuning], + AS_HELP_STRING([--enable-autotuning], + [enables autotuning functionality, default yes.]), + [ + if test x"$enableval" = x"yes"; then + enable_autotuning=yes + else + enable_autotuning=no + fi + ], + [enable_autotuning=yes]) +AC_MSG_RESULT([${enable_autotuning}]) +AM_CONDITIONAL([ENABLE_AUTOTUNING],[test x"$enable_autotuning" = x"yes"]) +if test x"${enable_autotuning}" = x"yes"; then + AC_DEFINE([ENABLE_AUTOTUNING], [1], [enable autotuning functionality]) +fi + +AC_MSG_CHECKING(whether C tests should be provided) +AC_ARG_ENABLE([c-tests], + AS_HELP_STRING([--enable-c-tests], + [enables the C tests for elpa, default yes.]), + [ + if test x"$enableval" = x"yes"; then + enable_c_tests=yes + else + enable_c_tests=no + fi + ], + [enable_c_tests=yes]) +AC_MSG_RESULT([${enable_c_tests}]) +AM_CONDITIONAL([ENABLE_C_TESTS],[test x"$enable_c_tests" = x"yes"]) +if test x"${enable_c_tests}" = x"yes"; then + AC_DEFINE([ENABLE_C_TESTS], [1], [enable C tests]) +fi + +AC_MSG_CHECKING(whether we build for K-computer) +AC_ARG_ENABLE([K-computer], + AS_HELP_STRING([--enable-K-computer], + [enable builds on K-Computer, default no.]), + [if test x"$enableval" = x"yes"; then + enable_kcomputer=yes + else + enable_kcomputer=no + fi], + [enable_kcomputer=no]) +AC_MSG_RESULT([${enable_kcomputer}]) +AM_CONDITIONAL([BUILD_KCOMPUTER],[test x"$enable_kcomputer" = x"yes"]) +if test x"${enable_kcomputer}" = x"yes"; then + AC_DEFINE([BUILD_KCOMPUTER], [1], [build for K-Computer]) + FC_MODINC="-I" + if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + AC_MSG_ERROR(on K-computer you have to switch off assumed-size arrays!) 
+ fi + if test x"${enable_fortran2008_features}" = x"yes" ; then + AC_MSG_ERROR(on K-computer you have to switch off Fortran 2008 features!) + fi +fi + +AC_MSG_CHECKING(whether we build for NEC SX-Auroa) +AC_ARG_ENABLE([SX-Aurora], + AS_HELP_STRING([--enable-SX-Aurora], + [enable builds on SX-Aurora, default no.]), + [if test x"$enableval" = x"yes"; then + enable_sxaurora=yes + else + enable_sxaurora=no + fi], + [enable_kcomputer=no]) +AC_MSG_RESULT([${enable_sxaurora}]) +AM_CONDITIONAL([BUILD_KCOMPUTER],[test x"$enable_sxaurora" = x"yes"]) +if test x"${enable_sxaurora}" = x"yes"; then + AC_DEFINE([BUILD_SXAURORA], [1], [build for SX-Aurora]) + FC_MODINC="-I" + #if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then + # AC_MSG_ERROR(on K-computer you have to switch off assumed-size arrays!) + #fi + if test x"${enable_fortran2008_features}" = x"yes" ; then + AC_MSG_ERROR(on SX-Aurora you have to switch off Fortran 2008 features!) + fi fi +if test x"${want_single_precision}" = x"yes" ; then + AC_DEFINE([WANT_SINGLE_PRECISION_REAL],[1],[build also single-precision for real calculation]) + AC_DEFINE([WANT_SINGLE_PRECISION_COMPLEX],[1],[build also single-precision for complex calculation]) +fi +AM_CONDITIONAL([WANT_SINGLE_PRECISION_REAL],[test x"$want_single_precision" = x"yes"]) +AM_CONDITIONAL([WANT_SINGLE_PRECISION_COMPLEX],[test x"$want_single_precision" = x"yes"]) + +#always define SKEWSYMMETRIC for the moment + +AC_MSG_CHECKING(whether we should enable skew-symmetric support) +AC_ARG_ENABLE([skew-symmetric-support], + AS_HELP_STRING([--enable-skew-symmetric-support], + [enable support for real valued skew-symmetric matrices]), + [if test x"$enableval" = x"yes"; then + enable_skewsymmetric=yes + else + enable_skewsymmetric=no + fi], + [enable_skewsymmetric=no]) +AC_MSG_RESULT([${enable_skewsymmetric}]) +AM_CONDITIONAL([HAVE_SKEWSYMMETRIC],[test x"$enable_skewsymmetric" = x"yes"]) +if test x"${enable_skewsymmetric}" = x"yes"; then + AC_DEFINE([HAVE_SKEWSYMMETRIC],[1],[build 
for skewsyemmtric case]) +fi + +AC_SUBST([MPI_BINARY]) AC_SUBST([WITH_MKL]) AC_SUBST([WITH_BLACS]) -AC_SUBST([with_amd_bulldozer_kernel]) AC_SUBST([FC_MODINC]) AC_SUBST([FC_MODOUT]) AC_SUBST([OPENMP_CFLAGS]) AC_SUBST([OPENMP_FCFLAGS]) AC_SUBST([OPENMP_LDFLAGS]) -#AC_SUBST(OPT_FCFLAGS) AC_SUBST([DOXYGEN_OUTPUT_DIR], [docs]) -rm -rf modules/ .fortran_dependencies/ -mkdir modules +mkdir -p modules private_modules test_modules + #gl_VISIBILITY #AH_BOTTOM([#if HAVE_VISIBILITY @@ -863,11 +1532,44 @@ # into "postdeps_FC" and causes linking errors later on. postdeps_FC=$(echo $postdeps_FC | sed 's/-l //g') -if test x"${enable_openmp}" = x"yes"; then - SUFFIX="_openmp" +if test x"${with_mpi}" = x"yes"; then + if test x"${enable_openmp}" = x"yes"; then + SUFFIX="_openmp" + else + SUFFIX="" + fi +else + if test x"${enable_openmp}" = x"yes"; then + SUFFIX="_onenode_openmp" + else + SUFFIX="_onenode" + fi +fi + +dnl store-build-config +echo "checking whether build config should be compiled into the library..." 
+AC_CHECK_PROG(xxd_CHECK,xxd,yes) +AS_IF([test x"$xxd_CHECK" != x"yes"], [AC_MSG_ERROR([Please install xxd before configuring.])]) +AC_ARG_ENABLE([store-build-config], + AS_HELP_STRING([--enable-store-build-config], + [compile build config into the library object, default no]), + [ + if test x"$enableval" = x"yes"; then + store_build_config=yes + else + store_build_config=no + fi + ], + [store_build_config=no]) +AM_CONDITIONAL([STORE_BUILD_CONFIG],[test x"$store_build_config" = x"yes"]) +if test x"${store_build_config}" = x"yes"; then + echo "build config should be compiled into the library: yes" + AC_DEFINE([STORE_BUILD_CONFIG], [1], [compile build config into the library object]) else - SUFFIX="" + echo "build config should be compiled into the library: no" fi + + AC_SUBST([SUFFIX]) AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc]) @@ -875,19 +1577,212 @@ Makefile Doxyfile ${PKG_CONFIG_FILE}:elpa.pc.in + elpa/elpa_constants.h + elpa/elpa_version.h + elpa/elpa_build_config.h ]) +m4_include([m4/ax_fc_check_define.m4]) +AC_MSG_CHECKING([if workaround for broken preprocessor is needed]) + +need_manual_cpp=no +AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[]) +AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[]) +ACTUAL_FC="$FC" +AC_SUBST([ACTUAL_FC]) + +if test x"$is_intel" = x"yes" ; then + need_manual_cpp=yes +fi +if test x"$is_pgi" = x"yes" ; then + need_manual_cpp=yes +fi + +if test x"$need_manual_cpp" = x"yes" ; then + AC_MSG_RESULT([yes]) + FC="\$(top_srcdir)/manual_cpp $FC" +else + AC_MSG_RESULT([no]) +fi + +if test x"$is_pgi" = x"yes" ; then + AC_DEFINE([PGI_VARIABLE_STRING_BUG], [1], [Work around a PGI bug with variable-length string results]) +fi + + +dnl PGI compiler uses -module to specify module output +dnl directory. 
This clashes with libtools -module link option +dnl => escape it for libtool with -Xcompiler -module +AC_MSG_CHECKING([whether we have to escape '-module' for libtool]) +if test x"$FC_MODOUT" = x'-module ' ; then + FC_MODOUT="-Xcompiler $FC_MODOUT -Xcompiler \$(ac_empty)" + FC="\$(top_srcdir)/remove_xcompiler $FC" + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +AC_MSG_CHECKING(whether --enable-python is specified) +AC_ARG_ENABLE([python], + AS_HELP_STRING([--enable-python], + [build and install python wrapper, default no.]), + [ + if test x"$enableval" = x"yes"; then + enable_python=yes + else + enable_python=no + fi + ], + [enable_python=no]) +AC_MSG_RESULT([${enable_python}]) +AM_CONDITIONAL([WITH_PYTHON],[test x"$enable_python" = x"yes"]) +if test x"${enable_python}" = x"yes"; then + AC_DEFINE([WITH_PYTHON], [1], [build and install python wrapper]) + # check for python and dependencies + AM_PATH_PYTHON([3.6]) + AC_ARG_VAR([PYTHON_INCLUDE], [Include flags for python, bypassing python-config]) + AC_ARG_VAR([PYTHON_CONFIG], [Path to python-config]) + AS_IF([test -z "$PYTHON_INCLUDE"], [ + AS_IF([test -z "$PYTHON_CONFIG"], [ + AC_PATH_PROGS([PYTHON_CONFIG], + [python$PYTHON_VERSION-config python-config], + [no], + [`dirname $PYTHON`]) + AS_IF([test "$PYTHON_CONFIG" = no], [AC_MSG_ERROR([cannot find python-config for $PYTHON.])]) + ]) + AC_MSG_CHECKING([python include flags]) + PYTHON_INCLUDE=`$PYTHON_CONFIG --includes` + AC_MSG_RESULT([$PYTHON_INCLUDE]) + ]) + AC_MSG_CHECKING([numpy module]) + AS_IF([$PYTHON -c "import numpy"], [AC_MSG_RESULT([found.])], + [AC_MSG_ERROR([cannot find numpy.])]) + AC_MSG_CHECKING([mpi4py module]) + AS_IF([$PYTHON -c "import mpi4py"], [AC_MSG_RESULT([found.])], + [AC_MSG_ERROR([cannot find mpi4py.])]) + AC_MSG_CHECKING([cython module]) + AS_IF([$PYTHON -c "import cython"], [AC_MSG_RESULT([found.])], + [AC_MSG_ERROR([cannot find cython.])]) + AC_CHECK_PROG([cython_found], [cython], [yes], [no]) + if test x"$cython_found" != 
x"yes" ; then + AC_MSG_ERROR([cython not found.]) + fi + AC_ARG_VAR([NUMPY_INCLUDE], [Include flags for numpy]) + AC_MSG_CHECKING([numpy include flags]) + NUMPY_INCLUDE=-I`$PYTHON -c "import numpy; print(numpy.get_include())"` + AS_IF([test "$NUMPY_INCLUDE" = "-I"], [AC_MSG_ERROR([cannot get numpy include path.])]) + AC_MSG_RESULT([$NUMPY_INCLUDE]) +fi +AC_MSG_CHECKING(whether --enable-python-tests is specified) +AC_ARG_ENABLE([python-tests], + AS_HELP_STRING([--enable-python-tests], + [enable python tests, default no.]), + [ + if test x"$enableval" = x"yes"; then + enable_python_tests=yes + else + enable_python_tests=no + fi + ], + [enable_python_tests=no]) +AC_MSG_RESULT([${enable_python_tests}]) +AM_CONDITIONAL([WITH_PYTHON_TESTS],[test x"$enable_python_tests" = x"yes"]) +if test x"${enable_python_tests}" = x"yes"; then + if test x"${enable_python}" = x"no"; then + AC_MSG_ERROR([Python tests can only be enabled it python is enabled.]) + fi + AC_CHECK_PROG([pytest_found], [pytest], [yes], [no]) + if test x"$pytest_found" != x"yes" ; then + AC_MSG_ERROR([pytest not found.]) + fi +fi AC_OUTPUT -if test "${can_compile_avx}" = "no" ; then -# if test x"${want_avx}" = x"yes" ; then - AC_MSG_WARN([Could not compile AVX instructions]) -# fi -fi -if test "${can_compile_avx2}" = "no" ; then -# if test x"${want_avx2}" = x"yes" ; then - AC_MSG_WARN([Could not compile AVX2 instructions]) -# fi +echo "" +echo "The following ELPA2 kernels will be build:" +echo "" +m4_foreach_w([elpa_m4_kind],[real complex],[ + m4_foreach_w([elpa_m4_kernel],m4_expand(elpa_m4_[]elpa_m4_kind[]_kernels),[ + if test x"$use_[]elpa_m4_kernel" = x"yes" ; then + echo -n " elpa_m4_kernel" + if test "$fixed_]elpa_m4_kind[_kernel" = "]elpa_m4_kernel[" ; then + echo -n " (selected as fixed kernel)" + fi + if test "$default_]elpa_m4_kind[_kernel" = "]elpa_m4_kernel[" ; then + echo -n " (default)" + fi + echo "" + fi + ]) +]) + +if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" -a x"${need_sse}" = x"no"; then + 
echo " " + AC_MSG_WARN([You did not request SSE support (--enable-sse), but your local CPU supports it.]) + AC_MSG_WARN([You might want to re-configure, except you are cross-compiling]) + echo " " +fi + +if test x"${ax_cv_have_sse3_cpu_ext}" = x"yes" -a x"${need_sse_assembly}" = x"no"; then + echo " " + AC_MSG_WARN([You did not request SSE-ASSEMBLY support (--enable-sse-assembly), but your local CPU supports it.]) + AC_MSG_WARN([You might want to re-configure, except you are cross-compiling]) + echo " " +fi + +if test x"${ax_cv_have_avx_cpu_ext}" = x"yes" -a x"${need_avx}" = x"no"; then + echo " " + AC_MSG_WARN([You did not request AVX support (--enable-avx), but your local CPU supports it.]) + AC_MSG_WARN([You might want to re-configure, except you are cross-compiling]) + echo " " +fi + +if test x"${ax_cv_have_avx2_cpu_ext}" = x"yes" -a x"${need_avx2}" = x"no"; then + echo " " + AC_MSG_WARN([You did not request AVX2 support (--enable-avx2), but your local CPU supports it.]) + AC_MSG_WARN([You might want to re-configure, except you are cross-compiling]) + echo " " +fi + +if test x"${ax_cv_have_avx512f_cpu_ext}" = x"yes" -a x"${need_avx512}" = x"no"; then + echo " " + AC_MSG_WARN([You did not request AVX512 support (--enable-avx512), but your local CPU supports it.]) + AC_MSG_WARN([You might want to re-configure, except you are cross-compiling]) + echo " " +fi + +echo " " +echo "***********************************************************************" +echo "* As announced, with this release candidate ELPA 2019.11.001, *" +echo "* the legacy API has been finally removed ! 
*" +echo "***********************************************************************" +echo " " +#echo " " +#echo "***********************************************************************" +#echo "* This is a the first release candidate of ELPA 2019.11.001 *" +#echo "* There might be still some changes until the final release of *" +#echo "* ELPA 2019.11.001 *" +#echo "***********************************************************************" +#echo " " + +if test x"$enable_kcomputer" = x"yes" ; then + echo " " + echo "Important message:" + echo "On K-computer (at the moment) the automatic creation of the generated" + echo "headers does not work." + echo "call: make -f ../generated_headers.am generated-headers top_srcdir=.." + echo "BEFORE triggering the build with make!" +else + if test x"$optional_c_error_argument" = x"yes" ; then + echo "#define OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h + else + echo "#undef OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h + fi + if test x"$store_build_config" = x"yes"; then + cat config.log > elpa_build_object + xxd -i elpa_build_object >> elpa/elpa_build_config.h + fi + + make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP" fi -make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" diff -Nru elpa-2016.05.001/CONTRIBUTING.md elpa-2019.11.001/CONTRIBUTING.md --- elpa-2016.05.001/CONTRIBUTING.md 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/CONTRIBUTING.md 2019-12-19 09:47:43.000000000 +0000 @@ -1,15 +1,16 @@ -How to contribute to the ELPA library: +### How to contribute to the *ELPA* library: ### -We are very happy and gratefull if you are willing to help us improve ELPA. Thus, we would like to make this process as simple as possible for you, +We are very happy and gratefull if you are willing to help us improve *ELPA*. 
+Thus, we would like to make this process as simple as possible for you, but at the same time still keep it manageable for us -For recommendations and suggestions, a simple email to us is sufficient! +For recommendations and suggestions, a simple email to us (*elpa-libray at mpcdf.mpg.de*) is sufficient! If you would like to share with us your improvements, we suggest the following ways: -1. If you use a public accessible git repository, please send us a merge request. This is the preferred way -2. An email with a patch, will also be ok. +1. If you use a public accessible git repository, please send us a merge request. This is the preferred way. +2. An email with a patch, will also be ok. Please use *elpa-library at mpcdf.mpg.de* -Thank you for supporting ELPA! +Thank you for supporting *ELPA*! The ELPA development team diff -Nru elpa-2016.05.001/debian/changelog elpa-2019.11.001/debian/changelog --- elpa-2016.05.001/debian/changelog 2020-03-22 15:39:00.000000000 +0000 +++ elpa-2019.11.001/debian/changelog 2020-10-03 10:45:18.000000000 +0000 @@ -1,8 +1,46 @@ -elpa (2016.05.001-6build1) focal; urgency=medium +elpa (2019.11.001-4) unstable; urgency=medium - * No-change rebuild for libgcc-s1 package name change. + * debian/rules (DEB_FCFLAGS_MAINT_APPEND): New variable, appends + -fallow-argument-mismatch-fallow-argument-mismatch to the gfortran flags. + (Closes: #957170) - -- Matthias Klose Sun, 22 Mar 2020 16:39:00 +0100 + -- Michael Banck Sat, 03 Oct 2020 12:45:18 +0200 + +elpa (2019.11.001-3) unstable; urgency=medium + + * Upload ot unstable. + + -- Michael Banck Sat, 04 Jul 2020 18:36:58 +0200 + +elpa (2019.11.001-2) experimental; urgency=medium + + * debian/rules (BUILD_FLAGS): Add --disable-sse-assembly. + * debian/libelpa15.symbols: Updated. + + -- Michael Banck Sat, 04 Jan 2020 17:49:33 +0100 + +elpa (2019.11.001-1) experimental; urgency=medium + + * New upstream release. + * debian/patches/fix_module_dir.patch: Updated. 
+ * debian/patches/testsuite_force_default_parameters.patch: Removed, no longer + needed. + * debian/patches/testsuite_custom_mpiexec.patch: Likewise. + * debian/patches/configure_fix_am_conditional.patch: Likewise. + * debian/control (Build-Depends): Added xxd. + * debian/rules (TEST_FLAGS): Use current upstream default parameters. + * debian/libelpa4.install: Renamed to ... + * debian/libelpa15.install: ... this. + * debian/libelpa4.symbols: Renamed to ... + * debian/libelpa15.symbols: ... this, and updated. + * debian/control: Rename libelpa4 to libelpa15. + * debian/libelpa-dev.docs: New file, include USERS_GUIDE.md. + * debian/rules: Add --max-parallel=1 and --builddir=build to dh. + * debian/rules (override_dh_auto_test): Updated. + * debian/rules (BUILD_FLAGS): New variable, disabling SSE and AVX. + * debian/rules (override_dh_auto_configure): Use it. + + -- Michael Banck Wed, 01 Jan 2020 21:47:36 +0100 elpa (2016.05.001-6) unstable; urgency=medium diff -Nru elpa-2016.05.001/debian/control elpa-2019.11.001/debian/control --- elpa-2016.05.001/debian/control 2018-06-24 19:19:52.000000000 +0000 +++ elpa-2019.11.001/debian/control 2020-01-01 15:16:05.000000000 +0000 @@ -10,13 +10,14 @@ libblas-dev, liblapack-dev, libscalapack-mpi-dev (>= 2), - mpi-default-dev + mpi-default-dev, + xxd Standards-Version: 4.1.0 Homepage: http://elpa.mpcdf.mpg.de/ Vcs-Browser: https://salsa.debian.org/debichem-team/elpa Vcs-Git: https://salsa.debian.org/debichem-team/elpa.git -Package: libelpa4 +Package: libelpa15 Architecture: any Section: libs Depends: ${misc:Depends}, ${shlibs:Depends} @@ -26,7 +27,7 @@ Package: libelpa-dev Architecture: any Section: libdevel -Depends: libelpa4 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} +Depends: libelpa15 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} Description: Eigenvalue SoLvers for Petaflop-Applications (Development version) ELPA is Eigenvalue SoLvers for Petaflop-Applications. . 
diff -Nru elpa-2016.05.001/debian/libelpa15.install elpa-2019.11.001/debian/libelpa15.install --- elpa-2016.05.001/debian/libelpa15.install 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/debian/libelpa15.install 2018-06-24 09:48:43.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/*/libelpa.so.* diff -Nru elpa-2016.05.001/debian/libelpa15.symbols elpa-2019.11.001/debian/libelpa15.symbols --- elpa-2016.05.001/debian/libelpa15.symbols 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/debian/libelpa15.symbols 2020-01-04 16:48:13.000000000 +0000 @@ -0,0 +1,405 @@ +libelpa.so.15 libelpa15 #MINVER# + DEFAULT_PARAMETERS@Base 2019.11.001 + EXPLICIT_PARAMETERS@Base 2019.11.001 + LEN@Base 2019.11.001 + STRUCTURE_PARAMETERS@Base 2019.11.001 + __compute_hh_trafo_MOD_compute_hh_trafo_complex_double@Base 2019.11.001 + __compute_hh_trafo_MOD_compute_hh_trafo_real_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_compute_hh_dotp_gpu_kernel_complex_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_compute_hh_dotp_gpu_kernel_real_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_compute_hh_trafo_gpu_kernel_complex_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_compute_hh_trafo_gpu_kernel_real_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_extract_hh_tau_gpu_kernel_complex_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_extract_hh_tau_gpu_kernel_real_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_my_pack_gpu_kernel_complex_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_my_pack_gpu_kernel_real_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_my_unpack_gpu_kernel_complex_double@Base 2019.11.001 + __cuda_c_kernel_MOD_launch_my_unpack_gpu_kernel_real_double@Base 2019.11.001 + __cuda_functions_MOD_cublas_cgemm@Base 2019.11.001 + __cuda_functions_MOD_cublas_cgemv@Base 2019.11.001 + __cuda_functions_MOD_cublas_create@Base 2019.11.001 + __cuda_functions_MOD_cublas_ctrmm@Base 2019.11.001 + __cuda_functions_MOD_cublas_destroy@Base 2019.11.001 + 
__cuda_functions_MOD_cublas_dgemm@Base 2019.11.001 + __cuda_functions_MOD_cublas_dgemv@Base 2019.11.001 + __cuda_functions_MOD_cublas_dtrmm@Base 2019.11.001 + __cuda_functions_MOD_cublas_sgemm@Base 2019.11.001 + __cuda_functions_MOD_cublas_sgemv@Base 2019.11.001 + __cuda_functions_MOD_cublas_strmm@Base 2019.11.001 + __cuda_functions_MOD_cublas_zgemm@Base 2019.11.001 + __cuda_functions_MOD_cublas_zgemv@Base 2019.11.001 + __cuda_functions_MOD_cublas_ztrmm@Base 2019.11.001 + __cuda_functions_MOD_cublashandle@Base 2019.11.001 + __cuda_functions_MOD_cuda_devicesynchronize@Base 2019.11.001 + __cuda_functions_MOD_cuda_free@Base 2019.11.001 + __cuda_functions_MOD_cuda_getdevicecount@Base 2019.11.001 + __cuda_functions_MOD_cuda_hostregistermapped@Base 2019.11.001 + __cuda_functions_MOD_cuda_hostregisterportable@Base 2019.11.001 + __cuda_functions_MOD_cuda_malloc@Base 2019.11.001 + __cuda_functions_MOD_cuda_memcpy2d@Base 2019.11.001 + __cuda_functions_MOD_cuda_memcpy@Base 2019.11.001 + __cuda_functions_MOD_cuda_memcpydevicetodevice@Base 2019.11.001 + __cuda_functions_MOD_cuda_memcpydevicetohost@Base 2019.11.001 + __cuda_functions_MOD_cuda_memcpyhosttodevice@Base 2019.11.001 + __cuda_functions_MOD_cuda_memset@Base 2019.11.001 + __cuda_functions_MOD_cuda_setdevice@Base 2019.11.001 + __cuda_functions_MOD_cuda_threadsynchronize@Base 2019.11.001 + __cuda_functions_MOD_cudahostregistermapped@Base 2019.11.001 + __cuda_functions_MOD_cudahostregisterportable@Base 2019.11.001 + __cuda_functions_MOD_cudamemcpydevicetodevice@Base 2019.11.001 + __cuda_functions_MOD_cudamemcpydevicetohost@Base 2019.11.001 + __cuda_functions_MOD_cudamemcpyhosttodevice@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_cholesky_complex_double_impl@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_cholesky_real_double_impl@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_invert_trm_complex_double_impl@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_invert_trm_real_double_impl@Base 2019.11.001 + 
__elpa1_auxiliary_impl_MOD_elpa_mult_ah_b_complex_double_impl@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_mult_at_b_real_double_impl@Base 2019.11.001 + __elpa1_auxiliary_impl_MOD_elpa_solve_tridi_double_impl@Base 2019.11.001 + __elpa1_compute_MOD_elpa_reduce_add_vectors_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_elpa_reduce_add_vectors_real_double@Base 2019.11.001 + __elpa1_compute_MOD_elpa_transpose_vectors_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_elpa_transpose_vectors_real_double@Base 2019.11.001 + __elpa1_compute_MOD_elpa_transpose_vectors_ss_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_elpa_transpose_vectors_ss_real_double@Base 2019.11.001 + __elpa1_compute_MOD_hh_transform_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_hh_transform_real_double@Base 2019.11.001 + __elpa1_compute_MOD_solve_tridi_double@Base 2019.11.001 + __elpa1_compute_MOD_solve_tridi_double_impl@Base 2019.11.001 + __elpa1_compute_MOD_trans_ev_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_trans_ev_real_double@Base 2019.11.001 + __elpa1_compute_MOD_tridiag_complex_double@Base 2019.11.001 + __elpa1_compute_MOD_tridiag_real_double@Base 2019.11.001 + __elpa1_impl_MOD_elpa_solve_evp_complex_1stage_double_impl@Base 2019.11.001 + __elpa1_impl_MOD_elpa_solve_evp_real_1stage_double_impl@Base 2019.11.001 + __elpa2_compute_MOD_band_band_real_double@Base 2019.11.001 + __elpa2_compute_MOD_bandred_complex_double@Base 2019.11.001 + __elpa2_compute_MOD_bandred_real_double@Base 2019.11.001 + __elpa2_compute_MOD_trans_ev_band_to_full_complex_double@Base 2019.11.001 + __elpa2_compute_MOD_trans_ev_band_to_full_real_double@Base 2019.11.001 + __elpa2_compute_MOD_trans_ev_tridi_to_band_complex_double@Base 2019.11.001 + __elpa2_compute_MOD_trans_ev_tridi_to_band_real_double@Base 2019.11.001 + __elpa2_compute_MOD_tridiag_band_complex_double@Base 2019.11.001 + __elpa2_compute_MOD_tridiag_band_real_double@Base 2019.11.001 + 
__elpa2_compute_MOD_which_qr_decomposition@Base 2019.11.001 + __elpa2_impl_MOD_elpa_solve_evp_complex_2stage_double_impl@Base 2019.11.001 + __elpa2_impl_MOD_elpa_solve_evp_real_2stage_double_impl@Base 2019.11.001 + __elpa2_workload_MOD_determine_workload@Base 2019.11.001 + __elpa2_workload_MOD_divide_band@Base 2019.11.001 + __elpa_MOD_elpa_allocate@Base 2019.11.001 + __elpa_MOD_elpa_autotune_deallocate@Base 2019.11.001 + __elpa_MOD_elpa_deallocate@Base 2019.11.001 + __elpa_abstract_impl_MOD___vtab_elpa_abstract_impl_Elpa_abstract_impl_t@Base 2019.11.001 + __elpa_abstract_impl_MOD_elpa_get_double@Base 2019.11.001 + __elpa_abstract_impl_MOD_elpa_get_integer@Base 2019.11.001 + __elpa_abstract_impl_MOD_elpa_set_double@Base 2019.11.001 + __elpa_abstract_impl_MOD_elpa_set_integer@Base 2019.11.001 + __elpa_api_MOD___vtab_elpa_api_Elpa_autotune_t@Base 2019.11.001 + __elpa_api_MOD___vtab_elpa_api_Elpa_t@Base 2019.11.001 + __elpa_api_MOD_elpa_c_string@Base 2019.11.001 + __elpa_api_MOD_elpa_get_api_version@Base 2019.11.001 + __elpa_api_MOD_elpa_initialized@Base 2019.11.001 + __elpa_api_MOD_elpa_int_string_to_value@Base 2019.11.001 + __elpa_api_MOD_elpa_int_value_to_string@Base 2019.11.001 + __elpa_api_MOD_elpa_option_cardinality@Base 2019.11.001 + __elpa_api_MOD_elpa_option_enumerate@Base 2019.11.001 + __elpa_api_MOD_elpa_strerr@Base 2019.11.001 + __elpa_api_MOD_elpa_uninit@Base 2019.11.001 + __elpa_autotune_impl_MOD___copy_elpa_autotune_impl_Elpa_autotune_impl_t@Base 2019.11.001 + __elpa_autotune_impl_MOD___def_init_elpa_autotune_impl_Elpa_autotune_impl_t@Base 2019.11.001 + __elpa_autotune_impl_MOD___vtab_elpa_autotune_impl_Elpa_autotune_impl_t@Base 2019.11.001 + __elpa_autotune_impl_MOD_elpa_autotune_destroy@Base 2019.11.001 + __elpa_autotune_impl_MOD_elpa_autotune_print@Base 2019.11.001 + __elpa_impl_MOD___copy_elpa_impl_Elpa_impl_t@Base 2019.11.001 + __elpa_impl_MOD___def_init_elpa_impl_Elpa_impl_t@Base 2019.11.001 + __elpa_impl_MOD___vtab_elpa_impl_Elpa_impl_t@Base 
2019.11.001 + __elpa_impl_MOD_elpa_associate_int@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_load_state@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_print_best@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_print_state@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_save_state@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_set_best@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_setup@Base 2019.11.001 + __elpa_impl_MOD_elpa_autotune_step@Base 2019.11.001 + __elpa_impl_MOD_elpa_can_set@Base 2019.11.001 + __elpa_impl_MOD_elpa_cholesky_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_cholesky_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_cholesky_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_cholesky_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_construct_scalapack_descriptor@Base 2019.11.001 + __elpa_impl_MOD_elpa_creating_from_legacy_api@Base 2019.11.001 + __elpa_impl_MOD_elpa_destroy@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvalues_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvalues_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvalues_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvalues_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvectors_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvectors_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvectors_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_eigenvectors_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvalues_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvalues_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvalues_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvalues_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvectors_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvectors_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvectors_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_generalized_eigenvectors_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_get_time@Base 2019.11.001 + __elpa_impl_MOD_elpa_hermitian_multiply_d@Base 2019.11.001 + 
__elpa_impl_MOD_elpa_hermitian_multiply_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_hermitian_multiply_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_hermitian_multiply_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_impl_allocate@Base 2019.11.001 + __elpa_impl_MOD_elpa_invert_trm_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_invert_trm_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_invert_trm_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_invert_trm_fc@Base 2019.11.001 + __elpa_impl_MOD_elpa_is_set@Base 2019.11.001 + __elpa_impl_MOD_elpa_load_settings@Base 2019.11.001 + __elpa_impl_MOD_elpa_print_settings@Base 2019.11.001 + __elpa_impl_MOD_elpa_print_times@Base 2019.11.001 + __elpa_impl_MOD_elpa_setup@Base 2019.11.001 + __elpa_impl_MOD_elpa_skew_eigenvalues_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_skew_eigenvalues_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_skew_eigenvectors_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_skew_eigenvectors_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_solve_tridiagonal_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_solve_tridiagonal_f@Base 2019.11.001 + __elpa_impl_MOD_elpa_store_settings@Base 2019.11.001 + __elpa_impl_MOD_elpa_timer_start@Base 2019.11.001 + __elpa_impl_MOD_elpa_timer_stop@Base 2019.11.001 + __elpa_impl_MOD_elpa_transform_back_generalized_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_transform_back_generalized_dc@Base 2019.11.001 + __elpa_impl_MOD_elpa_transform_generalized_d@Base 2019.11.001 + __elpa_impl_MOD_elpa_transform_generalized_dc@Base 2019.11.001 + __elpa_mpi_stubs_MOD_mpi_comm_rank@Base 2019.11.001 + __elpa_mpi_stubs_MOD_mpi_comm_size@Base 2019.11.001 + __elpa_mpi_stubs_MOD_mpi_comm_split@Base 2019.11.001 + __elpa_mpi_stubs_MOD_mpi_wtime@Base 2019.11.001 + __elpa_omp_MOD_omp_threads_caller@Base 2019.11.001 + __elpa_pdgeqrf_MOD_qr_pdgeqrf_2dcomm_double@Base 2019.11.001 + __elpa_pdgeqrf_MOD_qr_pdlarfg2_1dcomm_check_double@Base 2019.11.001 + __elpa_pdgeqrf_MOD_qr_pqrparam_init@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarfb_1dcomm_double@Base 
2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarfl2_tmatrix_1dcomm_double@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarfl_1dcomm_double@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarft_pdlarfb_1dcomm_double@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarft_set_merge_1dcomm_double@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_pdlarft_tree_merge_1dcomm_double@Base 2019.11.001 + __elpa_pdlarfb_MOD_qr_tmerge_pdlarfb_1dcomm_double@Base 2019.11.001 + __elpa_qrkernels_MOD_qr_dlarft_kernel_double@Base 2019.11.001 + __elpa_qrkernels_MOD_qr_pdlarfb_kernel_local_double@Base 2019.11.001 + __elpa_qrkernels_MOD_qr_pdlarft_merge_kernel_local_double@Base 2019.11.001 + __elpa_qrkernels_MOD_qr_tmerge_set_kernel_double@Base 2019.11.001 + __elpa_qrkernels_MOD_qr_tmerge_tree_kernel_double@Base 2019.11.001 + __elpa_skewsymmetric_blas_MOD_elpa_dssmv@Base 2019.11.001 + __elpa_skewsymmetric_blas_MOD_elpa_dssr2@Base 2019.11.001 + __elpa_skewsymmetric_blas_MOD_elpa_zssmv@Base 2019.11.001 + __elpa_skewsymmetric_blas_MOD_elpa_zssr2@Base 2019.11.001 + __elpa_utilities_MOD_check_alloc@Base 2019.11.001 + __elpa_utilities_MOD_check_alloc_cuda_f@Base 2019.11.001 + __elpa_utilities_MOD_check_dealloc_cuda_f@Base 2019.11.001 + __elpa_utilities_MOD_check_memcpy_cuda_f@Base 2019.11.001 + __elpa_utilities_MOD_least_common_multiple@Base 2019.11.001 + __elpa_utilities_MOD_local_index@Base 2019.11.001 + __elpa_utilities_MOD_map_global_array_index_to_local_index@Base 2019.11.001 + __elpa_utilities_MOD_pcol@Base 2019.11.001 + __elpa_utilities_MOD_prow@Base 2019.11.001 + __ftimings_MOD___copy_ftimings_Node_t@Base 2019.11.001 + __ftimings_MOD___copy_ftimings_Timer_t@Base 2019.11.001 + __ftimings_MOD___deallocate_ftimings_Node_t@Base 2019.11.001 + __ftimings_MOD___def_init_ftimings_Node_t@Base 2019.11.001 + __ftimings_MOD___def_init_ftimings_Timer_t@Base 2019.11.001 + __ftimings_MOD___vtab_ftimings_Node_t@Base 2019.11.001 + __ftimings_MOD___vtab_ftimings_Timer_t@Base 2019.11.001 + __ftimings_MOD_node_get_child@Base 
2019.11.001 + __ftimings_MOD_node_get_value@Base 2019.11.001 + __ftimings_MOD_node_new_child@Base 2019.11.001 + __ftimings_MOD_node_now@Base 2019.11.001 + __ftimings_MOD_node_print@Base 2019.11.001 + __ftimings_MOD_node_print_graph@Base 2019.11.001 + __ftimings_MOD_node_sort_children@Base 2019.11.001 + __ftimings_MOD_node_start@Base 2019.11.001 + __ftimings_MOD_node_stop@Base 2019.11.001 + __ftimings_MOD_node_sum_of_children@Base 2019.11.001 + __ftimings_MOD_node_sum_of_children_below@Base 2019.11.001 + __ftimings_MOD_node_sum_of_children_with_name@Base 2019.11.001 + __ftimings_MOD_timer_disable@Base 2019.11.001 + __ftimings_MOD_timer_enable@Base 2019.11.001 + __ftimings_MOD_timer_free@Base 2019.11.001 + __ftimings_MOD_timer_get@Base 2019.11.001 + __ftimings_MOD_timer_in_entries@Base 2019.11.001 + __ftimings_MOD_timer_is_enabled@Base 2019.11.001 + __ftimings_MOD_timer_measure_allocated_memory@Base 2019.11.001 + __ftimings_MOD_timer_measure_flops@Base 2019.11.001 + __ftimings_MOD_timer_measure_max_allocated_memory@Base 2019.11.001 + __ftimings_MOD_timer_measure_memory_bandwidth@Base 2019.11.001 + __ftimings_MOD_timer_measure_virtual_memory@Base 2019.11.001 + __ftimings_MOD_timer_print@Base 2019.11.001 + __ftimings_MOD_timer_set_print_options@Base 2019.11.001 + __ftimings_MOD_timer_since@Base 2019.11.001 + __ftimings_MOD_timer_sort@Base 2019.11.001 + __ftimings_MOD_timer_start@Base 2019.11.001 + __ftimings_MOD_timer_stop@Base 2019.11.001 + __ftimings_value_MOD___copy_ftimings_value_Value_t@Base 2019.11.001 + __ftimings_value_MOD___def_init_ftimings_value_Value_t@Base 2019.11.001 + __ftimings_value_MOD___vtab_ftimings_value_Value_t@Base 2019.11.001 + __ftimings_value_MOD_null_value@Base 2019.11.001 + __ftimings_value_MOD_value_add@Base 2019.11.001 + __ftimings_value_MOD_value_inverse@Base 2019.11.001 + __ftimings_value_MOD_value_minus@Base 2019.11.001 + __matrix_plot_MOD_prmat@Base 2019.11.001 + __mod_check_for_gpu_MOD_check_for_gpu@Base 2019.11.001 + 
__pack_unpack_cpu_MOD_pack_row_complex_cpu_double@Base 2019.11.001 + __pack_unpack_cpu_MOD_pack_row_real_cpu_double@Base 2019.11.001 + __pack_unpack_cpu_MOD_unpack_row_complex_cpu_double@Base 2019.11.001 + __pack_unpack_cpu_MOD_unpack_row_real_cpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_compute_hh_dot_products_complex_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_compute_hh_dot_products_real_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_extract_hh_tau_complex_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_extract_hh_tau_real_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_pack_row_group_complex_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_pack_row_group_real_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_unpack_and_prepare_row_group_complex_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_unpack_and_prepare_row_group_real_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_unpack_row_group_complex_gpu_double@Base 2019.11.001 + __pack_unpack_gpu_MOD_unpack_row_group_real_gpu_double@Base 2019.11.001 + __qr_utils_mod_MOD_local_size_offset_1d@Base 2019.11.001 + __qr_utils_mod_MOD_reverse_matrix_1dcomm_double@Base 2019.11.001 + __qr_utils_mod_MOD_reverse_matrix_2dcomm_ref_double@Base 2019.11.001 + __qr_utils_mod_MOD_reverse_matrix_local_double@Base 2019.11.001 + __qr_utils_mod_MOD_reverse_vector_local_double@Base 2019.11.001 + __redist_MOD_redist_band_complex_double@Base 2019.11.001 + __redist_MOD_redist_band_real_double@Base 2019.11.001 + __single_hh_trafo_real_MOD_single_hh_trafo_real_cpu_double@Base 2019.11.001 + cannons_reduction_c_d@Base 2019.11.001 + cannons_reduction_c_dc@Base 2019.11.001 + cannons_reduction_c_f@Base 2019.11.001 + cannons_reduction_c_fc@Base 2019.11.001 + cannons_reduction_d@Base 2019.11.001 + cannons_reduction_dc@Base 2019.11.001 + cannons_reduction_f@Base 2019.11.001 + cannons_reduction_fc@Base 2019.11.001 + cannons_triang_rectangular_c_d@Base 2019.11.001 + cannons_triang_rectangular_c_dc@Base 
2019.11.001 + cannons_triang_rectangular_c_f@Base 2019.11.001 + cannons_triang_rectangular_c_fc@Base 2019.11.001 + cannons_triang_rectangular_d@Base 2019.11.001 + cannons_triang_rectangular_dc@Base 2019.11.001 + cannons_triang_rectangular_f@Base 2019.11.001 + cannons_triang_rectangular_fc@Base 2019.11.001 + double_hh_trafo_complex_generic_double_@Base 2019.11.001 + double_hh_trafo_complex_generic_simple_double_@Base 2019.11.001 + double_hh_trafo_real_generic_double_@Base 2019.11.001 + double_hh_trafo_real_generic_simple_double_@Base 2019.11.001 + elpa_allocate@Base 2019.11.001 + elpa_autotune_deallocate@Base 2019.11.001 + elpa_autotune_load_state@Base 2019.11.001 + elpa_autotune_print_best@Base 2019.11.001 + elpa_autotune_print_state@Base 2019.11.001 + elpa_autotune_save_state@Base 2019.11.001 + elpa_autotune_set_best@Base 2019.11.001 + elpa_autotune_setup@Base 2019.11.001 + elpa_autotune_step@Base 2019.11.001 + elpa_cholesky_d@Base 2019.11.001 + elpa_cholesky_dc@Base 2019.11.001 + elpa_cholesky_f@Base 2019.11.001 + elpa_cholesky_fc@Base 2019.11.001 + elpa_deallocate@Base 2019.11.001 + elpa_eigenvalues_d@Base 2019.11.001 + elpa_eigenvalues_dc@Base 2019.11.001 + elpa_eigenvalues_f@Base 2019.11.001 + elpa_eigenvalues_fc@Base 2019.11.001 + elpa_eigenvectors_d@Base 2019.11.001 + elpa_eigenvectors_dc@Base 2019.11.001 + elpa_eigenvectors_f@Base 2019.11.001 + elpa_eigenvectors_fc@Base 2019.11.001 + elpa_generalized_eigenvalues_d@Base 2019.11.001 + elpa_generalized_eigenvalues_dc@Base 2019.11.001 + elpa_generalized_eigenvalues_f@Base 2019.11.001 + elpa_generalized_eigenvalues_fc@Base 2019.11.001 + elpa_generalized_eigenvectors_d@Base 2019.11.001 + elpa_generalized_eigenvectors_dc@Base 2019.11.001 + elpa_generalized_eigenvectors_f@Base 2019.11.001 + elpa_generalized_eigenvectors_fc@Base 2019.11.001 + elpa_get_double@Base 2019.11.001 + elpa_get_integer@Base 2019.11.001 + elpa_hermitian_multiply_d@Base 2019.11.001 + elpa_hermitian_multiply_dc@Base 2019.11.001 + 
elpa_hermitian_multiply_f@Base 2019.11.001 + elpa_hermitian_multiply_fc@Base 2019.11.001 + elpa_index_autotune_cardinality@Base 2019.11.001 + elpa_index_double_value_is_set@Base 2019.11.001 + elpa_index_free@Base 2019.11.001 + elpa_index_get_double_loc@Base 2019.11.001 + elpa_index_get_double_value@Base 2019.11.001 + elpa_index_get_int_loc@Base 2019.11.001 + elpa_index_get_int_value@Base 2019.11.001 + elpa_index_instance@Base 2019.11.001 + elpa_index_int_is_valid@Base 2019.11.001 + elpa_index_int_value_is_set@Base 2019.11.001 + elpa_index_int_value_to_strlen@Base 2019.11.001 + elpa_index_is_printing_mpi_rank@Base 2019.11.001 + elpa_index_load_autotune_state@Base 2019.11.001 + elpa_index_load_settings@Base 2019.11.001 + elpa_index_print_autotune_parameters@Base 2019.11.001 + elpa_index_print_autotune_state@Base 2019.11.001 + elpa_index_print_int_parameter@Base 2019.11.001 + elpa_index_print_settings@Base 2019.11.001 + elpa_index_set_autotune_parameters@Base 2019.11.001 + elpa_index_set_double_value@Base 2019.11.001 + elpa_index_set_from_load_double_value@Base 2019.11.001 + elpa_index_set_from_load_int_value@Base 2019.11.001 + elpa_index_set_int_value@Base 2019.11.001 + elpa_index_value_is_set@Base 2019.11.001 + elpa_init@Base 2019.11.001 + elpa_int_string_to_value@Base 2019.11.001 + elpa_int_value_to_string@Base 2019.11.001 + elpa_int_value_to_strlen@Base 2019.11.001 + elpa_invert_trm_d@Base 2019.11.001 + elpa_invert_trm_dc@Base 2019.11.001 + elpa_invert_trm_f@Base 2019.11.001 + elpa_invert_trm_fc@Base 2019.11.001 + elpa_load_settings@Base 2019.11.001 + elpa_option_cardinality@Base 2019.11.001 + elpa_option_enumerate@Base 2019.11.001 + elpa_print_settings@Base 2019.11.001 + elpa_set_double@Base 2019.11.001 + elpa_set_integer@Base 2019.11.001 + elpa_setup@Base 2019.11.001 + elpa_skew_eigenvalues_d@Base 2019.11.001 + elpa_skew_eigenvalues_f@Base 2019.11.001 + elpa_skew_eigenvectors_d@Base 2019.11.001 + elpa_skew_eigenvectors_f@Base 2019.11.001 + 
elpa_store_settings@Base 2019.11.001 + elpa_strerr@Base 2019.11.001 + elpa_uninit@Base 2019.11.001 + ftimings_highwater_mark@Base 2019.11.001 + ftimings_microseconds_since_epoch@Base 2019.11.001 + ftimings_resident_set_size@Base 2019.11.001 + ftimings_virtual_memory@Base 2019.11.001 + hexa_hh_trafo_real_generic_simple_6hv_double_@Base 2019.11.001 + hh_trafo_complex_kernel_12_2hv_double_@Base 2019.11.001 + hh_trafo_complex_kernel_12_double_@Base 2019.11.001 + hh_trafo_complex_kernel_4_2hv_double_@Base 2019.11.001 + hh_trafo_complex_kernel_4_double_@Base 2019.11.001 + hh_trafo_complex_kernel_8_2hv_double_@Base 2019.11.001 + hh_trafo_complex_kernel_8_double_@Base 2019.11.001 + hh_trafo_kernel_12_generic_double_@Base 2019.11.001 + hh_trafo_kernel_4_generic_double_@Base 2019.11.001 + hh_trafo_kernel_8_generic_double_@Base 2019.11.001 + max_threads_glob@Base 2019.11.001 + mpi_fortran_argv_null_@Base 2019.11.001 + mpi_fortran_argvs_null_@Base 2019.11.001 + mpi_fortran_bottom_@Base 2019.11.001 + mpi_fortran_errcodes_ignore_@Base 2019.11.001 + mpi_fortran_in_place_@Base 2019.11.001 + mpi_fortran_status_ignore_@Base 2019.11.001 + mpi_fortran_statuses_ignore_@Base 2019.11.001 + mpi_fortran_unweighted_@Base 2019.11.001 + mpi_fortran_weights_empty_@Base 2019.11.001 + quad_hh_trafo_real_generic_simple_4hv_double_@Base 2019.11.001 + set_max_threads_glob@Base 2019.11.001 + single_hh_trafo_complex_generic_double_@Base 2019.11.001 + single_hh_trafo_complex_generic_simple_double_@Base 2019.11.001 diff -Nru elpa-2016.05.001/debian/libelpa4.install elpa-2019.11.001/debian/libelpa4.install --- elpa-2016.05.001/debian/libelpa4.install 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/libelpa4.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/*/libelpa.so.* diff -Nru elpa-2016.05.001/debian/libelpa4.symbols elpa-2019.11.001/debian/libelpa4.symbols --- elpa-2016.05.001/debian/libelpa4.symbols 2018-06-24 09:48:43.000000000 +0000 +++ 
elpa-2019.11.001/debian/libelpa4.symbols 1970-01-01 00:00:00.000000000 +0000 @@ -1,110 +0,0 @@ -libelpa.so.4 libelpa4 #MINVER# - __complex_generic_kernel_MOD_single_hh_trafo_complex_generic@Base 2014.06.001 - __complex_generic_simple_kernel_MOD_single_hh_trafo_complex_generic_simple@Base 2014.06.001 - __compute_hh_trafo_complex_MOD_compute_hh_trafo_complex_cpu@Base 2016.05.001 - __compute_hh_trafo_real_MOD_compute_hh_trafo_real_cpu@Base 2016.05.001 - __elpa1_MOD_elpa_print_times@Base 2013.11.008 - __elpa1_MOD_get_elpa_communicators@Base 2016.05.001 - __elpa1_MOD_solve_evp_complex_1stage@Base 2016.05.001 - __elpa1_MOD_solve_evp_real_1stage@Base 2016.05.001 - __elpa1_MOD_time_evp_back@Base 2013.11.008 - __elpa1_MOD_time_evp_fwd@Base 2013.11.008 - __elpa1_MOD_time_evp_solve@Base 2013.11.008 - __elpa1_compute_MOD_cholesky_complex@Base 2016.05.001 - __elpa1_compute_MOD_cholesky_real@Base 2016.05.001 - __elpa1_compute_MOD_elpa_reduce_add_vectors_complex@Base 2016.05.001 - __elpa1_compute_MOD_elpa_reduce_add_vectors_real@Base 2016.05.001 - __elpa1_compute_MOD_elpa_transpose_vectors_complex@Base 2016.05.001 - __elpa1_compute_MOD_elpa_transpose_vectors_real@Base 2016.05.001 - __elpa1_compute_MOD_hh_transform_complex@Base 2016.05.001 - __elpa1_compute_MOD_hh_transform_real@Base 2016.05.001 - __elpa1_compute_MOD_invert_trm_complex@Base 2016.05.001 - __elpa1_compute_MOD_invert_trm_real@Base 2016.05.001 - __elpa1_compute_MOD_least_common_multiple@Base 2016.05.001 - __elpa1_compute_MOD_local_index@Base 2016.05.001 - __elpa1_compute_MOD_mult_ah_b_complex@Base 2016.05.001 - __elpa1_compute_MOD_mult_at_b_real@Base 2016.05.001 - __elpa1_compute_MOD_solve_tridi@Base 2016.05.001 - __elpa1_compute_MOD_trans_ev_complex@Base 2016.05.001 - __elpa1_compute_MOD_trans_ev_real@Base 2016.05.001 - __elpa1_compute_MOD_tridiag_complex@Base 2016.05.001 - __elpa1_compute_MOD_tridiag_real@Base 2016.05.001 - __elpa2_MOD_solve_evp_complex_2stage@Base 2013.11.008 - 
__elpa2_MOD_solve_evp_real_2stage@Base 2013.11.008 - __elpa2_compute_MOD_band_band_real@Base 2016.05.001 - __elpa2_compute_MOD_bandred_complex@Base 2016.05.001 - __elpa2_compute_MOD_bandred_real@Base 2016.05.001 - __elpa2_compute_MOD_divide_band@Base 2016.05.001 - __elpa2_compute_MOD_trans_ev_band_to_full_complex@Base 2016.05.001 - __elpa2_compute_MOD_trans_ev_band_to_full_real@Base 2016.05.001 - __elpa2_compute_MOD_trans_ev_tridi_to_band_complex@Base 2016.05.001 - __elpa2_compute_MOD_trans_ev_tridi_to_band_real@Base 2016.05.001 - __elpa2_compute_MOD_tridiag_band_complex@Base 2016.05.001 - __elpa2_compute_MOD_tridiag_band_real@Base 2016.05.001 - __elpa2_compute_MOD_which_qr_decomposition@Base 2016.05.001 - __elpa2_utilities_MOD_available_complex_elpa_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_available_real_elpa_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_check_allowed_complex_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_check_allowed_real_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_complex_elpa_kernel_names@Base 2015.02.001 - __elpa2_utilities_MOD_get_actual_complex_kernel@Base 2015.02.001 - __elpa2_utilities_MOD_get_actual_complex_kernel_name@Base 2015.02.001 - __elpa2_utilities_MOD_get_actual_real_kernel@Base 2015.02.001 - __elpa2_utilities_MOD_get_actual_real_kernel_name@Base 2015.02.001 - __elpa2_utilities_MOD_print_available_complex_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_print_available_real_kernels@Base 2015.02.001 - __elpa2_utilities_MOD_qr_decomposition_via_environment_variable@Base 2015.02.001 - __elpa2_utilities_MOD_query_available_complex_kernels@Base 2016.05.001 - __elpa2_utilities_MOD_query_available_real_kernels@Base 2016.05.001 - __elpa2_utilities_MOD_real_elpa_kernel_names@Base 2015.02.001 - __elpa_mpi_stubs_MOD_mpi_comm_rank@Base 2016.05.001 - __elpa_mpi_stubs_MOD_mpi_comm_size@Base 2016.05.001 - __elpa_mpi_stubs_MOD_mpi_comm_split@Base 2016.05.001 - __elpa_mpi_stubs_MOD_mpi_wtime@Base 2016.05.001 - 
__elpa_pdgeqrf_MOD_qr_pdgeqrf_2dcomm@Base 2015.02.001 - __elpa_pdgeqrf_MOD_qr_pdlarfg2_1dcomm_check@Base 2015.02.001 - __elpa_pdgeqrf_MOD_qr_pqrparam_init@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarfb_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarfl2_tmatrix_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarfl_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarft_pdlarfb_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarft_set_merge_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_pdlarft_tree_merge_1dcomm@Base 2015.02.001 - __elpa_pdlarfb_MOD_qr_tmerge_pdlarfb_1dcomm@Base 2015.02.001 - __elpa_utilities_MOD_debug_messages_via_environment_variable@Base 2015.02.001 - __elpa_utilities_MOD_pcol@Base 2015.02.002 - __elpa_utilities_MOD_prow@Base 2015.02.002 - __pack_unpack_complex_MOD_pack_row_complex_cpu@Base 2016.05.001 - __pack_unpack_complex_MOD_unpack_row_complex_cpu@Base 2016.05.001 - __pack_unpack_real_MOD_pack_row_real_cpu@Base 2016.05.001 - __pack_unpack_real_MOD_unpack_row_real_cpu@Base 2016.05.001 - __qr_utils_mod_MOD_local_size_offset_1d@Base 2015.02.001 - __qr_utils_mod_MOD_reverse_matrix_1dcomm@Base 2015.02.001 - __qr_utils_mod_MOD_reverse_matrix_2dcomm_ref@Base 2015.02.001 - __qr_utils_mod_MOD_reverse_matrix_local@Base 2015.02.001 - __qr_utils_mod_MOD_reverse_vector_local@Base 2015.02.001 - __real_generic_kernel_MOD_double_hh_trafo_generic@Base 2016.05.001 - __real_generic_simple_kernel_MOD_double_hh_trafo_generic_simple@Base 2014.06.001 - __single_hh_trafo_real_MOD_single_hh_trafo_real_cpu@Base 2016.05.001 - (arch=amd64)double_hh_trafo@Base 2016.05.001 - elpa_get_communicators@Base 2015.02.002 - elpa_solve_evp_complex_1stage@Base 2015.02.002 - elpa_solve_evp_complex_2stage@Base 2015.02.002 - elpa_solve_evp_real_1stage@Base 2015.02.002 - elpa_solve_evp_real_2stage@Base 2015.02.002 - get_elpa_communicators@Base 2016.05.001 - (optional)mpi_fortran_argv_null_@Base 2013.11.008 - (optional)mpi_fortran_argvs_null_@Base 2013.11.008 - 
(optional)mpi_fortran_bottom_@Base 2013.11.008 - (optional)mpi_fortran_errcodes_ignore_@Base 2013.11.008 - (optional)mpi_fortran_in_place_@Base 2013.11.008 - (optional)mpi_fortran_status_ignore_@Base 2013.11.008 - (optional)mpi_fortran_statuses_ignore_@Base 2013.11.008 - (optional)mpi_fortran_unweighted_@Base 2016.05.001 - (optional)mpi_fortran_weights_empty_@Base 2016.05.001 - qr_dlarft_kernel_@Base 2015.02.001 - qr_pdlarfb_kernel_local_@Base 2015.02.001 - qr_pdlarft_merge_kernel_local_@Base 2015.02.001 - qr_tmerge_set_kernel_@Base 2015.02.001 - qr_tmerge_tree_kernel_@Base 2015.02.001 - (arch=amd64)single_hh_trafo_complex@Base 2016.05.001 diff -Nru elpa-2016.05.001/debian/libelpa-dev.docs elpa-2019.11.001/debian/libelpa-dev.docs --- elpa-2016.05.001/debian/libelpa-dev.docs 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/debian/libelpa-dev.docs 2020-01-01 12:54:31.000000000 +0000 @@ -0,0 +1 @@ +USERS_GUIDE.md diff -Nru elpa-2016.05.001/debian/patches/configure_fix_am_conditional.patch elpa-2019.11.001/debian/patches/configure_fix_am_conditional.patch --- elpa-2016.05.001/debian/patches/configure_fix_am_conditional.patch 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/patches/configure_fix_am_conditional.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,22 +0,0 @@ ---- ./configure.ac.orig 2016-05-28 16:05:41.963772844 +0200 -+++ ./configure.ac 2016-05-28 16:06:01.799772239 +0200 -@@ -70,7 +70,7 @@ - - dnl mpi - AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi=[[yes|no]]], [compile with MPI. 
Default: yes])],,[with_mpi=yes]) --AM_CONDITIONAL([WITH_MPI],[test x"with_mpi" = x"yes"]) -+AM_CONDITIONAL([WITH_MPI],[test x"$with_mpi" = x"yes"]) - if test x"${with_mpi}" = x"yes"; then - AC_DEFINE([WITH_MPI], [1], [use MPI]) - fi ---- ./configure.orig 2016-05-28 16:10:03.995764856 +0200 -+++ ./configure 2016-05-28 16:10:06.471764780 +0200 -@@ -3302,7 +3302,7 @@ - with_mpi=yes - fi - -- if test x"with_mpi" = x"yes"; then -+ if test x"$with_mpi" = x"yes"; then - WITH_MPI_TRUE= - WITH_MPI_FALSE='#' - else diff -Nru elpa-2016.05.001/debian/patches/fix_module_dir.patch elpa-2019.11.001/debian/patches/fix_module_dir.patch --- elpa-2016.05.001/debian/patches/fix_module_dir.patch 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/patches/fix_module_dir.patch 2020-01-01 15:18:49.000000000 +0000 @@ -1,26 +1,26 @@ -Index: elpa-2016.05.001/Makefile.am +Index: elpa/Makefile.am =================================================================== ---- elpa-2016.05.001.orig/Makefile.am -+++ elpa-2016.05.001/Makefile.am -@@ -133,7 +133,7 @@ include generated_headers.am +--- elpa.orig/Makefile.am ++++ elpa/Makefile.am +@@ -436,7 +436,7 @@ include generated_headers.am BUILT_SOURCES = $(generated_headers) - # install any .mod files in the include/ dir + # install public headers and Fortran modules files in the include/ dir -elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ +elpa_includedir = $(includedir)/elpa@SUFFIX@ - nobase_elpa_include_HEADERS = $(wildcard modules/*) - nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h - -Index: elpa-2016.05.001/Makefile.in + nobase_elpa_include_HEADERS = \ + $(wildcard modules/*) \ + src/helpers/lapack_interfaces.h \ +Index: elpa/Makefile.in =================================================================== ---- elpa-2016.05.001.orig/Makefile.in -+++ elpa-2016.05.001/Makefile.in -@@ -1079,7 +1079,7 @@ generated_headers = config-f90.h elpa/el +--- elpa.orig/Makefile.in ++++ 
elpa/Makefile.in +@@ -7349,7 +7349,7 @@ generated_headers = config-f90.h elpa/el BUILT_SOURCES = $(generated_headers) - # install any .mod files in the include/ dir + # install public headers and Fortran modules files in the include/ dir -elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ +elpa_includedir = $(includedir)/elpa@SUFFIX@ - nobase_elpa_include_HEADERS = $(wildcard modules/*) elpa/elpa.h \ - elpa/elpa_kernel_constants.h elpa/elpa_generated.h - dist_man_MANS = \ + nobase_elpa_include_HEADERS = \ + $(wildcard modules/*) \ + src/helpers/lapack_interfaces.h \ diff -Nru elpa-2016.05.001/debian/patches/series elpa-2019.11.001/debian/patches/series --- elpa-2016.05.001/debian/patches/series 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/patches/series 2020-01-01 19:31:00.000000000 +0000 @@ -1,4 +1 @@ fix_module_dir.patch -testsuite_force_default_parameters.patch -testsuite_custom_mpiexec.patch -configure_fix_am_conditional.patch diff -Nru elpa-2016.05.001/debian/patches/testsuite_custom_mpiexec.patch elpa-2019.11.001/debian/patches/testsuite_custom_mpiexec.patch --- elpa-2016.05.001/debian/patches/testsuite_custom_mpiexec.patch 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/patches/testsuite_custom_mpiexec.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ ---- elpa-2016.05.001.orig/Makefile.am -+++ elpa-2016.05.001/Makefile.am -@@ -304,16 +304,17 @@ check_SCRIPTS += \ - elpa2_test_complex_c_version@SUFFIX@.sh - endif - -+MPIEXEC ?= mpiexec -n 2 --oversubscribe - - # test scripts - if WITH_MPI -- wrapper="mpiexec -n 2 " -+ wrapper=$(MPIEXEC) - else -- wrapper="" -+ wrapper= - endif - TESTS = $(check_SCRIPTS) - %.sh: % -- echo '$(wrapper)./$^ $$TEST_FLAGS' > $@ -+ echo '$(wrapper) ./$^ $$TEST_FLAGS' > $@ - chmod +x $@ - - ## this one does not want any arguments ---- elpa-2016.05.001.orig/Makefile.in -+++ elpa-2016.05.001/Makefile.in -@@ -1195,10 +1185,10 @@ check_SCRIPTS = elpa1_test_real@SUFFIX@. 
- elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_print_kernels@SUFFIX@ $(am__append_23) --@WITH_MPI_FALSE@wrapper = "" -+@WITH_MPI_FALSE@wrapper = - - # test scripts --@WITH_MPI_TRUE@wrapper = "mpiexec -n 2 " -+@WITH_MPI_TRUE@wrapper = $(MPIEXEC) - @DX_COND_doc_TRUE@@DX_COND_html_TRUE@DX_CLEAN_HTML = @DX_DOCDIR@/html - @DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHM = @DX_DOCDIR@/chm - @DX_COND_chi_TRUE@@DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi -@@ -2639,8 +2628,10 @@ elpa/elpa_generated_fortran_interfaces.h - $(call extract_interface,!f>) - $(call extract_interface,#!f>) - generated-headers: $(generated_headers) -+ -+MPIEXEC ?= mpiexec -n 2 --oversubscribe - %.sh: % -- echo '$(wrapper)./$^ $$TEST_FLAGS' > $@ -+ echo '$(wrapper) ./$^ $$TEST_FLAGS' > $@ - chmod +x $@ - - #elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh: diff -Nru elpa-2016.05.001/debian/patches/testsuite_force_default_parameters.patch elpa-2019.11.001/debian/patches/testsuite_force_default_parameters.patch --- elpa-2016.05.001/debian/patches/testsuite_force_default_parameters.patch 2018-06-24 09:48:43.000000000 +0000 +++ elpa-2019.11.001/debian/patches/testsuite_force_default_parameters.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ ---- ./test/c_test_programs/elpa1_test_real_c_version.c.orig 2016-05-28 14:36:25.507936145 +0200 -+++ ./test/c_test_programs/elpa1_test_real_c_version.c 2016-05-28 14:44:17.651921751 +0200 -@@ -92,10 +92,17 @@ - myid = 0; - MPI_COMM_WORLD=1; - #endif -+ - na = 1000; - nev = 500; - nblk = 16; - -+ if( argc == 4 ) { -+ na = atoi(argv[1]); -+ nev = atoi(argv[2]); -+ nblk = atoi(argv[3]); -+ } -+ - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); ---- ./test/c_test_programs/elpa1_test_complex_c_version.c.orig 2016-05-20 07:09:52.000000000 +0200 -+++ ./test/c_test_programs/elpa1_test_complex_c_version.c 
2016-05-28 14:45:31.703919493 +0200 -@@ -94,10 +94,17 @@ - myid=0; - MPI_COMM_WORLD=1; - #endif -+ - na = 1000; - nev = 500; - nblk = 16; - -+ if( argc == 4 ) { -+ na = atoi(argv[1]); -+ nev = atoi(argv[2]); -+ nblk = atoi(argv[3]); -+ } -+ - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); ---- ./test/c_test_programs/elpa2_test_real_c_version.c.orig 2016-05-20 07:09:52.000000000 +0200 -+++ ./test/c_test_programs/elpa2_test_real_c_version.c 2016-05-28 14:45:55.907918755 +0200 -@@ -93,10 +93,17 @@ - myid=0; - MPI_COMM_WORLD=1; - #endif -+ - na = 1000; - nev = 500; - nblk = 16; - -+ if( argc == 4 ) { -+ na = atoi(argv[1]); -+ nev = atoi(argv[2]); -+ nblk = atoi(argv[3]); -+ } -+ - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); ---- ./test/c_test_programs/elpa2_test_complex_c_version.c.orig 2016-05-20 07:09:52.000000000 +0200 -+++ ./test/c_test_programs/elpa2_test_complex_c_version.c 2016-05-28 14:46:11.707918274 +0200 -@@ -96,10 +96,17 @@ - myid =0; - MPI_COMM_WORLD=1; - #endif -+ - na = 1000; - nev = 500; - nblk = 16; - -+ if( argc == 4 ) { -+ na = atoi(argv[1]); -+ nev = atoi(argv[2]); -+ nblk = atoi(argv[3]); -+ } -+ - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); diff -Nru elpa-2016.05.001/debian/rules elpa-2019.11.001/debian/rules --- elpa-2016.05.001/debian/rules 2018-06-24 19:19:52.000000000 +0000 +++ elpa-2019.11.001/debian/rules 2020-10-03 10:36:17.000000000 +0000 @@ -13,7 +13,9 @@ export DEB_LDFLAGS_MAINT_APPEND=-Wl,--as-needed endif -export TEST_FLAGS?=2000 750 8 +export DEB_FCFLAGS_MAINT_APPEND = -fallow-argument-mismatch + +export TEST_FLAGS?=500 150 16 ifneq (,$(filter $(DEB_HOST_ARCH), mips mipsel powerpc)) export TEST_FLAGS=200 12 2 @@ -23,19 +25,23 @@ export MPIEXEC=mpiexec -n 1 endif +ifeq (,$(filter custom,$(DEB_BUILD_OPTIONS))) +BUILD_FLAGS=--disable-sse --disable-sse-assembly --disable-avx --disable-avx2 
--disable-avx512 +endif + +# the current upstream version (2019.11.001) FTBFS when make is run in +# parallel, so force parallel to off via --max-parallel=1 %: - dh $@ + dh $@ --max-parallel=1 --builddir=build override_dh_auto_configure: - dh_auto_configure -- --disable-silent-rules + dh_auto_configure -- --disable-silent-rules ${BUILD_FLAGS} ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) override_dh_auto_test: - @echo "Running tests with matrix size ${ELPA_TEST_SCALE}" - $(MAKE) -j1 check || cat test-suite.log - grep Total.time *.log - grep -A1 ^Matrix.size *.log - ./elpa2_print_kernels + @echo "Running tests with matrix size ${TEST_FLAGS% *}" + (cd build; $(MAKE) -j1 check || cat test-suite.log) + build/elpa2_print_kernels # error out in case of test suite failures - if grep ^FAIL test-suite.log; then exit 1; fi + if grep ^FAIL build/test-suite.log; then exit 1; fi endif diff -Nru elpa-2016.05.001/depcomp elpa-2019.11.001/depcomp --- elpa-2016.05.001/depcomp 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/depcomp 2019-12-21 16:29:47.000000000 +0000 @@ -1,9 +1,9 @@ #! /bin/sh # depcomp - compile a program generating dependencies as side-effects -scriptversion=2013-05-30.07; # UTC +scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1999-2014 Free Software Foundation, Inc. +# Copyright (C) 1999-2018 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -16,7 +16,7 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -783,9 +783,9 @@ # Local Variables: # mode: shell-script # sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru elpa-2016.05.001/Doxyfile.in elpa-2019.11.001/Doxyfile.in --- elpa-2016.05.001/Doxyfile.in 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/Doxyfile.in 2019-12-19 09:47:40.000000000 +0000 @@ -1,4 +1,4 @@ -# Doxyfile 1.8.10 +# Doxyfile 1.8.12 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -32,7 +32,7 @@ # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = Eigenvalue SoLvers for Petaflop-Applications (ELPA) +PROJECT_NAME = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -44,7 +44,7 @@ # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -0PROJECT_BRIEF = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" +#PROJECT_BRIEF = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 @@ -118,7 +118,17 @@ # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. 
-ABBREVIATE_BRIEF = +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief @@ -256,7 +266,7 @@ # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. -OPTIMIZE_FOR_FORTRAN = Yes +OPTIMIZE_FOR_FORTRAN = YES # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. @@ -293,6 +303,15 @@ MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -739,6 +758,12 @@ WARN_NO_PARAMDOC = NO +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated @@ -765,7 +790,7 @@ # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. 
-INPUT = @top_srcdir@/src @top_srcdir@/test @builddir@/elpa +INPUT = @top_srcdir@/elpa/ @top_srcdir@/src @top_srcdir@/test @builddir@/elpa @builddir@/config-f90.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -790,7 +815,52 @@ # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. -FILE_PATTERNS = +FILE_PATTERNS =*.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.F90 \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.F \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. @@ -805,82 +875,139 @@ # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = @top_srcdir@/src/elpa1_compute.F90 \ - @top_srcdir@/src/mod_precision.f90 \ - @top_srcdir@/src/aligned_mem.F90 \ - @top_srcdir@/src/mod_compute_hh_trafo_real.F90 \ - @top_srcdir@/src/mod_compute_hh_trafo_complex.F90 \ - @top_srcdir@/src/mod_mpi.F90 \ - @top_srcdir@/src/mod_mpi_stubs.F90 \ - @top_srcdir@/src/mod_time_c.F90 \ - @top_srcdir@/src/mod_pack_unpack_complex.F90 \ - @top_srcdir@/src/mod_pack_unpack_real.F90 \ - @top_srcdir@/src/elpa2_compute.F90 \ - @top_srcdir@/src/elpa2_utilities.F90 \ - @top_srcdir@/src/elpa_c_interface.F90 \ - @top_srcdir@/src/elpa_reduce_add_vectors.X90 \ - @top_srcdir@/src/elpa_transpose_vectors.X90 \ - @top_srcdir@/src/elpa_utilities.F90 \ - @top_srcdir@/src/timer.F90 \ - @top_srcdir@/src/redist_band.X90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \ - @top_srcdir@/src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ - @top_srcdir@/src/elpa2_kernels/mod_fortran_interfaces.F90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real.F90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_simple.F90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex.F90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c \ - @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c \ 
- @top_srcdir@/src/elpa_qr/elpa_pdgeqrf.F90 \ - @top_srcdir@/src/elpa_qr/elpa_pdlarfb.F90 \ - @top_srcdir@/src/elpa_qr/elpa_qrkernels.f90 \ - @top_srcdir@/src/elpa_qr/qr_utils.F90 \ - @top_srcdir@/src/ftimings/ftimings.F90 \ - @top_srcdir@/src/ftimings/ftimings_type.F90 \ - @top_srcdir@/src/ftimings/ftimings_value.F90 \ - @top_srcdir@/src/ftimings/highwater_mark.c \ - @top_srcdir@/src/ftimings/papi.c \ - @top_srcdir@/src/ftimings/resident_set_size.c \ - @top_srcdir@/src/ftimings/time.c \ - @top_srcdir@/src/ftimings/virtual_memory.c \ - @top_srcdir@/test/shared_sources/mod_output_types.F90 \ - @top_srcdir@/test/c_test_programs/elpa1_test_complex_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa1_test_real_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa2_test_complex_c_version.c \ - @top_srcdir@/test/c_test_programs/elpa2_test_real_c_version.c \ - @top_srcdir@/test/fortran_test_programs/read_real.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2_default_kernel.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex2.F90 \ - @top_srcdir@/test/fortran_test_programs/test_complex.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real2.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real.F90 \ - @top_srcdir@/test/fortran_test_programs/test_real_with_c.F90 \ - @top_srcdir@/test/shared_sources/blacs_infrastructure.F90 \ - @top_srcdir@/test/shared_sources/call_elpa1.c \ - @top_srcdir@/test/shared_sources/call_elpa2.c \ - @top_srcdir@/test/shared_sources/check_correctnes.F90 \ - @top_srcdir@/test/shared_sources/mod_from_c.F90 \ - @top_srcdir@/test/shared_sources/prepare_matrix.F90 \ - 
@top_srcdir@/test/shared_sources/read_input_parameters.F90 \ - @top_srcdir@/test/shared_sources/redir.c \ - @top_srcdir@/test/shared_sources/redirect.F90 \ - @top_srcdir@/test/shared_sources/setup_mpi.F90 \ - @top_srcdir@/test/shared_sources/util.F90 +EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ + @top_srcdir@/fortran_constants.h \ + @top_srcdir@/src/elpa_index.c \ + @top_srcdir@/src/elpa_driver/legacy_interface/elpa_driver_c_interface_template.F90 \ + @top_srcdir@/src/elpa_driver/legacy_interface/elpa.F90 \ + @top_srcdir@/src/elpa_driver/legacy_interface/elpa_driver_c_interface.F90 \ + @top_srcdir@/src/elpa_constants.F90 \ + @top_srcdir@/src/elpa_c_interface.c \ + @top_srcdir@/src/elpa2/mod_pack_unpack_cpu.F90 \ + @top_srcdir@/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_ssymm_matrix_allreduce_real_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_tridiag_band_template.F90 \ + @top_srcdir@/src/elpa2/mod_redist_band.F90 \ + @top_srcdir@/src/elpa2/pack_unpack_cpu.F90 \ + @top_srcdir@/src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_print_kernels.F90 \ + @top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ + @top_srcdir@/src/elpa2/kernels/simple_template.F90 \ + @top_srcdir@/src/elpa2/kernels/real_template.F90 \ + @top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \ + @top_srcdir@/src/elpa2/kernels/complex_template.F90 \ + @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \ + 
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \ + @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_simple.F90 \ + @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real.F90 \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \ + @top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_bgq.f90 \ + @top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \ + 
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_simple.F90 \ + @top_srcdir@/src/elpa2/kernels/complex.F90 \ + @top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_bgp.f90 \ + @top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \ + @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \ + @top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \ + @top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \ + @top_srcdir@/src/elpa2/mod_pack_unpack_gpu.F90 \ + @top_srcdir@/src/elpa2/legacy_interface/elpa_2stage_c_interface.F90 \ + @top_srcdir@/src/elpa2/legacy_interface/elpa2_c_interface_template.F90 \ + @top_srcdir@/src/elpa2/legacy_interface/elpa2_template.F90 \ + @top_srcdir@/src/elpa2/legacy_interface/elpa2_utilities.F90 \ + @top_srcdir@/src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu \ + @top_srcdir@/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \ + @top_srcdir@/src/elpa2/GPU/interface_c_kernel.F90 \ + @top_srcdir@/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \ + @top_srcdir@/src/elpa2/elpa2_compute.F90 \ + @top_srcdir@/src/elpa2/redist_band.F90 \ + 
@top_srcdir@/src/elpa2/elpa2_template.F90 \ + @top_srcdir@/src/elpa2/elpa2_determine_workload.F90 \ + @top_srcdir@/src/elpa2/mod_compute_hh_trafo.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_pdlarfb.F90 \ + @top_srcdir@/src/elpa2/qr/qr_utils_template.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_pdgeqrf.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_pdlarfb_template.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_qrkernels_template.F90 \ + @top_srcdir@/src/elpa2/qr/qr_utils.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_pdgeqrf_template.F90 \ + @top_srcdir@/src/elpa2/qr/elpa_qrkernels.F90 \ + @top_srcdir@/src/elpa2/elpa2_compute_real_template.F90 \ + @top_srcdir@/src/elpa2/compute_hh_trafo.F90 \ + @top_srcdir@/src/elpa_generated_fortran_interfaces.F90 \ + @top_srcdir@/elpa/elpa_index.h \ + @top_srcdir@/src/elpa1/elpa1_solve_tridi_real_template.F90 \ + @top_srcdir@/src/elpa1/elpa_multiply_a_b.F90 \ + @top_srcdir@/src/elpa1/elpa_cholesky_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_mult_at_b_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_multiply_a_b.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_invert_trm_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_cholesky_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa1_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_cholesky_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_solve_tridi_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa1_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa1_auxiliary.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_1stage_c_interface.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_mult_ah_b_c_interface_template.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_invert_trm.F90 \ + @top_srcdir@/src/elpa1/legacy_interface/elpa_solve_tridi.F90 \ + @top_srcdir@/src/elpa1/elpa1_compute_template.F90 \ + @top_srcdir@/src/elpa1/elpa_reduce_add_vectors.F90 \ + 
@top_srcdir@/src/elpa1/elpa1_merge_systems_real_template.F90 \ + @top_srcdir@/src/elpa1/elpa1_compute_private.F90 \ + @top_srcdir@/src/elpa1/elpa1_template.F90 \ + @top_srcdir@/src/elpa1/elpa_solve_tridi_impl_public.F90 \ + @top_srcdir@/src/elpa1/elpa1_trans_ev_template.F90 \ + @top_srcdir@/src/elpa1/elpa_transpose_vectors.F90 \ + @top_srcdir@/src/elpa1/elpa_transpose_vectors_ss.F90 \ + @top_srcdir@/src/elpa1/elpa1_auxiliary.F90 \ + @top_srcdir@/src/elpa1/elpa1_tridiag_template.F90 \ + @top_srcdir@/src/elpa1/elpa1_tools_template.F90 \ + @top_srcdir@/src/elpa1/elpa_invert_trm.F90 \ + @top_srcdir@/src/elpa1/elpa1_utilities.F90 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -896,7 +1023,12 @@ # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = */test/* \ + */kernels/* \ + */ftimings/* \ + */general/* \ + */helpers/* \ + */GPU/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -920,7 +1052,7 @@ # *.h) to filter out the source-files in the directories. If left blank all # files are included. -EXAMPLE_PATTERNS = +EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands @@ -949,6 +1081,10 @@ # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. 
INPUT_FILTER = @@ -958,6 +1094,10 @@ # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. FILTER_PATTERNS = @@ -1800,6 +1940,14 @@ LATEX_BIB_STYLE = plain +# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_TIMESTAMP = NO + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -2031,7 +2179,7 @@ # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2053,7 +2201,7 @@ # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. 
-INCLUDE_PATH = +INCLUDE_PATH = @builddir@ @builddir@/elpa # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the diff -Nru elpa-2016.05.001/elpa/elpa_build_config.h.in elpa-2019.11.001/elpa/elpa_build_config.h.in --- elpa-2016.05.001/elpa/elpa_build_config.h.in 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_build_config.h.in 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,2 @@ +// The stored build config + diff -Nru elpa-2016.05.001/elpa/elpa_constants.h.in elpa-2019.11.001/elpa/elpa_constants.h.in --- elpa-2016.05.001/elpa/elpa_constants.h.in 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_constants.h.in 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,136 @@ +#pragma once + +/* This might seem over-engineered, but helps to re-use this file also on the + * Fortran side and thus to keep the definitions in this one place here + */ + +/* Private helper macros */ +#define ELPA_ENUM_ENTRY(name, value, ...) \ + name = value, +#define ELPA_ENUM_SUM(name, value, ...) +1 + + +/* Solver constants */ +#define ELPA_FOR_ALL_SOLVERS(X) \ + X(ELPA_SOLVER_1STAGE, 1) \ + X(ELPA_SOLVER_2STAGE, 2) + +enum ELPA_SOLVERS { + ELPA_FOR_ALL_SOLVERS(ELPA_ENUM_ENTRY) +}; + +#define ELPA_NUMBER_OF_SOLVERS (0 ELPA_FOR_ALL_SOLVERS(ELPA_ENUM_SUM)) + +/* Kernel constants */ +#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X, ...) 
\ + X(ELPA_2STAGE_REAL_GENERIC, 1, @ELPA_2STAGE_REAL_GENERIC_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_GENERIC_SIMPLE, 2, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_BGP, 3, @ELPA_2STAGE_REAL_BGP_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_BGQ, 4, @ELPA_2STAGE_REAL_BGQ_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SSE_ASSEMBLY, 5, @ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SSE_BLOCK2, 6, @ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SSE_BLOCK4, 7, @ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SSE_BLOCK6, 8, @ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX_BLOCK2, 9, @ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX_BLOCK4, 10, @ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX_BLOCK6, 11, @ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX2_BLOCK2, 12, @ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX2_BLOCK4, 13, @ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX2_BLOCK6, 14, @ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, 
@ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) + +#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ + ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ + X(ELPA_2STAGE_REAL_INVALID, -1, choke me) \ + X(ELPA_2STAGE_REAL_DEFAULT, @ELPA_2STAGE_REAL_DEFAULT@, choke me) + +enum ELPA_REAL_KERNELS { + ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(ELPA_ENUM_ENTRY) +}; + + +#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X, ...) 
\ + X(ELPA_2STAGE_COMPLEX_GENERIC, 1, @ELPA_2STAGE_COMPLEX_GENERIC_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE, 2, @ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_BGP, 3, @ELPA_2STAGE_COMPLEX_BGP_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_BGQ, 4, @ELPA_2STAGE_COMPLEX_BGQ_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY, 5, @ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_SSE_BLOCK1, 6, @ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_SSE_BLOCK2, 7, @ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX_BLOCK1, 8, @ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX_BLOCK2, 9, @ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1, 10, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2, 11, @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1, 12, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2, 13, @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ + X(ELPA_2STAGE_COMPLEX_GPU, 14, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) + +#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \ + ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \ + X(ELPA_2STAGE_COMPLEX_INVALID, -1, choke me) \ + X(ELPA_2STAGE_COMPLEX_DEFAULT, @ELPA_2STAGE_COMPLEX_DEFAULT@, choke me) + +enum ELPA_COMPLEX_KERNELS { + ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(ELPA_ENUM_ENTRY) +}; + + + +/* General constants */ +#define ELPA_FOR_ALL_ERRORS(X) \ + X(ELPA_OK, 0) \ + X(ELPA_ERROR, -1) \ + X(ELPA_ERROR_ENTRY_NOT_FOUND, -2) \ + X(ELPA_ERROR_ENTRY_INVALID_VALUE, -3) \ + X(ELPA_ERROR_ENTRY_ALREADY_SET, -4) \ + X(ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION, -5) \ + X(ELPA_ERROR_SETUP, -6) \ + 
X(ELPA_ERROR_CRITICAL, -7) \ + X(ELPA_ERROR_API_VERSION, -8) \ + X(ELPA_ERROR_AUTOTUNE_API_VERSION, -9) \ + X(ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED, -10) \ + X(ELPA_ERROR_ENTRY_READONLY, -11) \ + X(ELPA_ERROR_CANNOT_OPEN_FILE, -12) + +enum ELPA_ERRORS { + ELPA_FOR_ALL_ERRORS(ELPA_ENUM_ENTRY) +}; + +enum ELPA_CONSTANTS { + ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS = (0 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(ELPA_ENUM_SUM)), + ELPA_2STAGE_NUMBER_OF_REAL_KERNELS = (0 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(ELPA_ENUM_SUM)), +}; + +#define ELPA_FOR_ALL_AUTOTUNE_LEVELS(X, ...) \ + X(ELPA_AUTOTUNE_NOT_TUNABLE, 0) \ + X(ELPA_AUTOTUNE_FAST, 1) \ + X(ELPA_AUTOTUNE_MEDIUM, 2) \ + X(ELPA_AUTOTUNE_EXTENSIVE, 3) + +enum ELPA_AUTOTUNE_LEVELS { + ELPA_FOR_ALL_AUTOTUNE_LEVELS(ELPA_ENUM_ENTRY) +}; + + +#define ELPA_FOR_ALL_AUTOTUNE_DOMAINS(X, ...) \ + X(ELPA_AUTOTUNE_DOMAIN_REAL, 1) \ + X(ELPA_AUTOTUNE_DOMAIN_COMPLEX, 2) \ + X(ELPA_AUTOTUNE_DOMAIN_ANY, 3) + +enum ELPA_AUTOTUNE_DOMAINS { + ELPA_FOR_ALL_AUTOTUNE_DOMAINS(ELPA_ENUM_ENTRY) +}; diff -Nru elpa-2016.05.001/elpa/elpa_generated.h elpa-2019.11.001/elpa/elpa_generated.h --- elpa-2016.05.001/elpa/elpa_generated.h 2016-05-20 07:51:42.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_generated.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,118 +0,0 @@ - #include - /*! \brief C old, deprecated interface to create the MPI communicators for ELPA - * - * \param mpi_comm_word MPI global communicator (in) - * \param my_prow Row coordinate of the calling process in the process grid (in) - * \param my_pcol Column coordinate of the calling process in the process grid (in) - * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - * \result int integer error value of mpi_comm_split function - */ - int elpa_get_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - #include - /*! 
\brief C interface to create the MPI communicators for ELPA - * - * \param mpi_comm_word MPI global communicator (in) - * \param my_prow Row coordinate of the calling process in the process grid (in) - * \param my_pcol Column coordinate of the calling process in the process grid (in) - * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - * \result int integer error value of mpi_comm_split function - */ - int get_elpa_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - /*! \brief C interface to solve the real eigenvalue problem with 1-stage solver - * - * \param na Order of matrix a - * \param nev Number of eigenvalues needed. - * The smallest nev eigenvalues/eigenvectors are calculated. - * \param a Distributed matrix for which eigenvalues are to be computed. - * Distribution is like in Scalapack. - * The full matrix must be set (not only one half like in scalapack). - * \param lda Leading dimension of a - * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - * \param q On output: Eigenvectors of a - * Distribution is like in Scalapack. - * Must be always dimensioned to the full size (corresponding to (na,na)) - * even if only a part of the eigenvalues is needed. - * \param ldq Leading dimension of q - * \param nblk blocksize of cyclic distribution, must be the same in both directions! - * \param matrixCols distributed number of matrix columns - * \param mpi_comm_rows MPI-Communicator for rows - * \param mpi_comm_cols MPI-Communicator for columns - * - * \result int: 1 if error occured, otherwise 0 -*/ - int elpa_solve_evp_real_1stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - /*! \brief C interface to solve the complex eigenvalue problem with 1-stage solver - * - * \param na Order of matrix a - * \param nev Number of eigenvalues needed. 
- * The smallest nev eigenvalues/eigenvectors are calculated. - * \param a Distributed matrix for which eigenvalues are to be computed. - * Distribution is like in Scalapack. - * The full matrix must be set (not only one half like in scalapack). - * \param lda Leading dimension of a - * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - * \param q On output: Eigenvectors of a - * Distribution is like in Scalapack. - * Must be always dimensioned to the full size (corresponding to (na,na)) - * even if only a part of the eigenvalues is needed. - * \param ldq Leading dimension of q - * \param nblk blocksize of cyclic distribution, must be the same in both directions! - * \param matrixCols distributed number of matrix columns - * \param mpi_comm_rows MPI-Communicator for rows - * \param mpi_comm_cols MPI-Communicator for columns - * - * \result int: 1 if error occured, otherwise 0 - */ - int elpa_solve_evp_complex_1stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - /*! \brief C interface to solve the real eigenvalue problem with 2-stage solver - * - * \param na Order of matrix a - * \param nev Number of eigenvalues needed. - * The smallest nev eigenvalues/eigenvectors are calculated. - * \param a Distributed matrix for which eigenvalues are to be computed. - * Distribution is like in Scalapack. - * The full matrix must be set (not only one half like in scalapack). - * \param lda Leading dimension of a - * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - * \param q On output: Eigenvectors of a - * Distribution is like in Scalapack. - * Must be always dimensioned to the full size (corresponding to (na,na)) - * even if only a part of the eigenvalues is needed. - * \param ldq Leading dimension of q - * \param nblk blocksize of cyclic distribution, must be the same in both directions! 
- * \param matrixCols distributed number of matrix columns - * \param mpi_comm_rows MPI-Communicator for rows - * \param mpi_comm_cols MPI-Communicator for columns - * \param mpi_coll_all MPI communicator for the total processor set - * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - * \param use_qr use QR decomposition 1 = yes, 0 = no - * - * \result int: 1 if error occured, otherwise 0 - */ - int elpa_solve_evp_real_2stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); - /*! \brief C interface to solve the complex eigenvalue problem with 2-stage solver - * - * \param na Order of matrix a - * \param nev Number of eigenvalues needed. - * The smallest nev eigenvalues/eigenvectors are calculated. - * \param a Distributed matrix for which eigenvalues are to be computed. - * Distribution is like in Scalapack. - * The full matrix must be set (not only one half like in scalapack). - * \param lda Leading dimension of a - * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - * \param q On output: Eigenvectors of a - * Distribution is like in Scalapack. - * Must be always dimensioned to the full size (corresponding to (na,na)) - * even if only a part of the eigenvalues is needed. - * \param ldq Leading dimension of q - * \param nblk blocksize of cyclic distribution, must be the same in both directions! 
- * \param matrixCols distributed number of matrix columns - * \param mpi_comm_rows MPI-Communicator for rows - * \param mpi_comm_cols MPI-Communicator for columns - * \param mpi_coll_all MPI communicator for the total processor set - * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - * \param use_qr use QR decomposition 1 = yes, 0 = no - * - * \result int: 1 if error occured, otherwise 0 - */ - int elpa_solve_evp_complex_2stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); diff -Nru elpa-2016.05.001/elpa/elpa_generic.h elpa-2019.11.001/elpa/elpa_generic.h --- elpa-2016.05.001/elpa/elpa_generic.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_generic.h 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,228 @@ +#pragma once + +/*! \brief generic C method for elpa_set + * + * \details + * \param handle handle of the ELPA object for which a key/value pair should be set + * \param name the name of the key + * \param value integer/double value to be set for the key + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_set(e, name, value, error) _Generic((value), \ + int: \ + elpa_set_integer, \ + \ + double: \ + elpa_set_double \ + )(e, name, value, error) + + +/*! \brief generic C method for elpa_get + * + * \details + * \param handle handle of the ELPA object for which a key/value pair should be queried + * \param name the name of the key + * \param value integer/double value to be queried + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_get(e, name, value, error) _Generic((value), \ + int*: \ + elpa_get_integer, \ + \ + double*: \ + elpa_get_double \ + )(e, name, value, error) + + +/*! 
\brief generic C method for elpa_eigenvectors + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a + * \param ev on return: float/double pointer to eigenvalues + * \param q on return: float/double float complex/double complex pointer to eigenvectors + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_eigenvectors(handle, a, ev, q, error) _Generic((a), \ + double*: \ + elpa_eigenvectors_d, \ + \ + float*: \ + elpa_eigenvectors_f, \ + \ + double complex*: \ + elpa_eigenvectors_dc, \ + \ + float complex*: \ + elpa_eigenvectors_fc \ + )(handle, a, ev, q, error) + +/*! \brief generic C method for elpa_skew_eigenvectors + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a + * \param ev on return: float/double pointer to eigenvalues + * \param q on return: float/double float complex/double complex pointer to eigenvectors + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_skew_eigenvectors(handle, a, ev, q, error) _Generic((a), \ + double*: \ + elpa_eigenvectors_d, \ + \ + float*: \ + elpa_eigenvectors_f, \ + )(handle, a, ev, q, error) + + + +/*! 
\brief generic C method for elpa_generalized_eigenvectors + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a + * \param b float/double float complex/double complex pointer to matrix b + * \param ev on return: float/double pointer to eigenvalues + * \param q on return: float/double float complex/double complex pointer to eigenvectors + * \param is_already_decomposed set to 1, if b already decomposed by previous call to elpa_generalized + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_generalized_eigenvectors(handle, a, b, ev, q, is_already_decomposed, error) _Generic((a), \ + double*: \ + elpa_generalized_eigenvectors_d, \ + \ + float*: \ + elpa_generalized_eigenvectors_f, \ + \ + double complex*: \ + elpa_generalized_eigenvectors_dc, \ + \ + float complex*: \ + elpa_generalized_eigenvectors_fc \ + )(handle, a, b, ev, q, is_already_decomposed, error) + + +/*! \brief generic C method for elpa_eigenvalues + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a + * \param ev on return: float/double pointer to eigenvalues + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_eigenvalues(handle, a, ev, error) _Generic((a), \ + double*: \ + elpa_eigenvalues_d, \ + \ + float*: \ + elpa_eigenvalues_f, \ + \ + double complex*: \ + elpa_eigenvalues_dc, \ + \ + float complex*: \ + elpa_eigenvalues_fc \ + )(handle, a, ev, error) + +/*! 
\brief generic C method for elpa_skew_eigenvalues + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a + * \param ev on return: float/double pointer to eigenvalues + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_skew_eigenvalues(handle, a, ev, error) _Generic((a), \ + double*: \ + elpa_eigenvalues_d, \ + \ + float*: \ + elpa_eigenvalues_f, \ + )(handle, a, ev, error) + + +/* \brief generic C method for elpa_cholesky + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a, for which + * the cholesky factorizaion will be computed + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_cholesky(handle, a, error) _Generic((a), \ + double*: \ + elpa_cholesky_d, \ + \ + float*: \ + elpa_cholesky_f, \ + \ + double complex*: \ + elpa_cholesky_dc, \ + \ + float complex*: \ + elpa_cholesky_fc \ + )(handle, a, error) + + +/*! 
\brief generic C method for elpa_hermitian_multiply + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param uplo_a descriptor for matrix a + * \param uplo_c descriptor for matrix c + * \param ncb int + * \param a float/double float complex/double complex pointer to matrix a + * \param b float/double float complex/double complex pointer to matrix b + * \param nrows_b number of rows for matrix b + * \param ncols_b number of cols for matrix b + * \param c float/double float complex/double complex pointer to matrix c + * \param nrows_c number of rows for matrix c + * \param ncols_c number of cols for matrix c + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_hermitian_multiply(handle, uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, c, nrows_c, ncols_c, error) _Generic((a), \ + double*: \ + elpa_hermitian_multiply_d, \ + \ + float*: \ + elpa_hermitian_multiply_f, \ + \ + double complex*: \ + elpa_hermitian_multiply_dc, \ + \ + float complex*: \ + elpa_hermitian_multiply_fc \ + )(handle, a, error) + + +/*! 
\brief generic C method for elpa_invert_triangular + * + * \details + * \param handle handle of the ELPA object, which defines the problem + * \param a float/double float complex/double complex pointer to matrix a, which + * should be inverted + * \param error on return the error code, which can be queried with elpa_strerr() + * \result void + */ +#define elpa_invert_triangular(handle, a, error) _Generic((a), \ + double*: \ + elpa_invert_trm_d, \ + \ + float*: \ + elpa_invert_trm_f, \ + \ + double complex*: \ + elpa_invert_trm_dc, \ + \ + float complex*: \ + elpa_invert_trm_fc \ + )(handle, a, error) diff -Nru elpa-2016.05.001/elpa/elpa.h elpa-2019.11.001/elpa/elpa.h --- elpa-2016.05.001/elpa/elpa.h 2016-02-26 14:11:56.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa.h 2019-12-19 09:47:44.000000000 +0000 @@ -1,2 +1,23 @@ -#include +#ifndef ELPA_H +#define ELPA_H + +#include +#include + +#include + +struct elpa_struct; +typedef struct elpa_struct *elpa_t; + +struct elpa_autotune_struct; +typedef struct elpa_autotune_struct *elpa_autotune_t; + + +#include +#include #include +#include + +const char *elpa_strerr(int elpa_error); + +#endif diff -Nru elpa-2016.05.001/elpa/elpa_kernel_constants.h elpa-2019.11.001/elpa/elpa_kernel_constants.h --- elpa-2016.05.001/elpa/elpa_kernel_constants.h 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_kernel_constants.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -#define ELPA2_REAL_KERNEL_GENERIC 1 -#define ELPA2_REAL_KERNEL_GENERIC_SIMPLE 2 -#define ELPA2_REAL_KERNEL_BGP 3 -#define ELPA2_REAL_KERNEL_BGQ 4 -#define ELPA2_REAL_KERNEL_SSE 5 -#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6 -#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7 -#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8 -#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9 -#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10 -#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11 -#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12 -#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13 -#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14 - 
-#define ELPA2_NUMBER_OF_REAL_KERNELS 14 - - -#define ELPA2_COMPLEX_KERNEL_GENERIC 1 -#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2 -#define ELPA2_COMPLEX_KERNEL_BGP 3 -#define ELPA2_COMPLEX_KERNEL_BGQ 4 -#define ELPA2_COMPLEX_KERNEL_SSE 5 -#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6 -#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8 -#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9 -#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10 -#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11 - - -#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 11 diff -Nru elpa-2016.05.001/elpa/elpa_simd_constants.h elpa-2019.11.001/elpa/elpa_simd_constants.h --- elpa-2016.05.001/elpa/elpa_simd_constants.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_simd_constants.h 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,13 @@ +#define CPU_MANUFACTURER 1 +#define GENERIC_INSTR 2 +#define BLUEGENE_INSTR 3 +#define SSE_INSTR 4 +#define AVX_INSTR 5 +#define AVX2_INSTR 6 +#define AVX512_INSTR 7 +#define NVIDIA_INSTR 8 +#define VSX_INSTR 9 +#define ARCH64_INSTR 10 +#define SPARC_INSTR 11 + +#define NUMBER_OF_INSTR 12 diff -Nru elpa-2016.05.001/elpa/elpa_version.h.in elpa-2019.11.001/elpa/elpa_version.h.in --- elpa-2016.05.001/elpa/elpa_version.h.in 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/elpa/elpa_version.h.in 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,3 @@ +#define ELPA_API_VERSION @CURRENT_API_VERSION@ +#define ELPA_AUTOTUNE_API_VERSION @CURRENT_AUTOTUNE_VERSION@ + diff -Nru elpa-2016.05.001/elpa.pc.in elpa-2019.11.001/elpa.pc.in --- elpa-2016.05.001/elpa.pc.in 2016-02-26 14:11:56.000000000 +0000 +++ elpa-2019.11.001/elpa.pc.in 2019-12-19 09:47:40.000000000 +0000 @@ -7,6 +7,6 @@ Description: ELPA is a Fortran-based high-performance computational library for the (massively) parallel solution of symmetric or Hermitian, standard or generalized eigenvalue problems. 
Version: @PACKAGE_VERSION@ URL: @PACKAGE_URL@ -Libs: -L${libdir} -lelpa@SUFFIX@ @LIBS@ @OPENMP_FCFLAGS@ +Libs: -L${libdir} -lelpa@SUFFIX@ @LIBS@ @SCALAPACK_LDFLAGS@ @OPENMP_FCFLAGS@ Cflags: -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@ @OPENMP_CFLAGS@ -fcflags= -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@/modules @OPENMP_FCFLAGS@ +fcflags= -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@/modules @SCALAPACK_FCFLAGS@ @OPENMP_FCFLAGS@ diff -Nru elpa-2016.05.001/elpa.spec elpa-2019.11.001/elpa.spec --- elpa-2016.05.001/elpa.spec 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/elpa.spec 2019-12-20 05:57:47.000000000 +0000 @@ -1,7 +1,7 @@ # # spec file for package elpa # -# Copyright (c) 2015 Lorenz Hüdepohl +# Copyright (c) 2015 Lorenz Huedepohl # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -12,35 +12,33 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. 
-%define so_version 4 +%define so_version 8 # OpenMP support requires an MPI implementation with MPI_THREAD_MULTIPLE support, -# which is only available for a sufficiently configured openmpi >= 1.8 -# Set to 0 to disable -%define with_openmp 1 +# which is currently not provided at OpenSUSE +%define with_openmp 0 Name: elpa -Version: 2016.05.001 -Release: 2 +Version: 2019.11.001 +Release: 1 Summary: A massively parallel eigenvector solver License: LGPL-3.0 Group: System/Libraries Url: https://elpa.rzg.mpg.de/ Source0: https://elpa.mpcdf.mpg.de/html/Releases/%{version}/%{name}-%{version}.tar.gz -Requires: openmpi -BuildRequires: gcc-c++ -BuildRequires: gcc-fortran +BuildRequires: c_compiler +BuildRequires: gcc-fortran >= 4.8 BuildRequires: openmpi-devel -BuildRequires: blas +Requires: openmpi BuildRequires: blas-devel -BuildRequires: lapack BuildRequires: lapack-devel BuildRequires: pkg-config +BuildRequires: autoconf +BuildRequires: automake +BuildRequires: libtool %if %{defined fedora} -BuildRequires: scalapack-openmpi BuildRequires: scalapack-openmpi-devel -BuildRequires: blacs-openmpi BuildRequires: blacs-openmpi-devel BuildRequires: environment-modules %endif @@ -51,6 +49,7 @@ # For make check, mpirun of openmpi needs an installed openssh BuildRequires: openssh +BuildRoot: %{_tmppath}/%{name}-%{version}-build %description A new efficient distributed parallel direct eigenvalue solver for @@ -104,7 +103,6 @@ Group: Development/Libraries Requires: %{name} = %{version} Requires: openmpi -Requires: libstdc++-devel Requires: lapack-devel Requires: blas-devel Requires: libscalapack2-openmpi-devel @@ -150,7 +148,6 @@ Group: Development/Libraries Requires: %{name}_openmp = %{version} Requires: openmpi -Requires: libstdc++-devel Requires: lapack-devel Requires: blas-devel Requires: libscalapack2-openmpi-devel @@ -176,26 +173,79 @@ %if %{defined fedora} module load mpi/openmpi-%{_arch} %endif +%if %{defined suse_version} +. 
%{_libdir}/mpi/gcc/openmpi/bin/mpivars.sh +%endif + +if [ ! -e configure ] ; then + # It is possible to use the Open Build Server to automatically + # checkout from git directly, extract this spec file and set the + # version of the extracted spec file to some autogenerated + # string based on the date and git hash of the checkout commit. + # This line ensures that the spec file in the tar-ball has + # that same version, so that autogen will insert that version + # in the configure script + perl -pi -e "s/^Version:.*/Version: %{version}/" elpa.spec + + ./autogen.sh +fi + +# Normal build mkdir build pushd build %define _configure ../configure -%configure --docdir=%{_docdir}/%{name}-%{version} + +%configure \ +%ifarch i386 i486 i586 i686 x86_64 + CFLAGS="$CFLAGS -msse4.2" \ + FCFLAGS="$FFLAGS $FCFLAGS -msse4.2" \ +%endif +%ifnarch i386 i486 i586 i686 x86_64 + --disable-sse \ + --disable-sse-assembly \ +%endif + --disable-avx \ + --disable-avx2 \ + --docdir=%{_docdir}/%{name}-%{version} \ + || { cat config.log; exit 1; } + make %{?_smp_mflags} V=1 popd + +# OpenMP build %if %{defined with_openmp} mkdir build_openmp pushd build_openmp -%configure --docdir=%{_docdir}/%{name}_openmp-%{version} --enable-openmp + +%configure \ +%ifarch i386 i486 i586 i686 x86_64 + CFLAGS="$CFLAGS -msse4.2" \ + FCFLAGS="$FFLAGS $FCFLAGS -msse4.2" \ +%endif +%ifnarch i386 i486 i586 i686 x86_64 + --disable-sse \ + --disable-sse-assembly \ +%endif + --disable-avx \ + --disable-avx2 \ + --docdir=%{_docdir}/%{name}_openmp-%{version} \ + --enable-openmp \ + || { cat config.log; exit 1; } + make %{?_smp_mflags} V=1 popd -%endif + +%endif # OpenMP %check %if %{defined fedora} module load mpi/openmpi-%{_arch} %endif +%if %{defined suse_version} +. 
%{_libdir}/mpi/gcc/openmpi/bin/mpivars.sh +%endif pushd build make check TEST_FLAGS="1500 50 16" || { echo "Tests failed: Content of ./test-suite.log:"; cat ./test-suite.log; echo; exit 1; } @@ -229,19 +279,17 @@ %files -n lib%{name}%{so_version} # See http://en.opensuse.org/openSUSE:Shared_library_packaging_policy # to explain this package's name -%defattr(0755,root,root) +%defattr(-,root,root) %{_libdir}/lib%{name}.so.* -%doc -%defattr(0644,root,root) %{_docdir}/%{name}-%{version}/* %dir %{_docdir}/%{name}-%{version} %files tools %attr(0755,root,root) %{_bindir}/elpa2_print_kernels -%attr(0644,root,root) %_mandir/man1/elpa2_print_kernels.1.gz +%attr(0644,root,root) %{_mandir}/man1/elpa2_print_kernels.1.gz %files devel -%defattr(0644,root,root) +%defattr(-,root,root) %{_libdir}/pkgconfig/%{name}-%{version}.pc %{_includedir}/%{name}-%{version} %{_libdir}/lib%{name}.so @@ -249,32 +297,31 @@ %_mandir/man3/* %files devel-static -%defattr(0644,root,root) +%defattr(-,root,root) %{_libdir}/lib%{name}.a %if %{defined with_openmp} %files -n lib%{name}_openmp%{so_version} -%defattr(0755,root,root) +%defattr(-,root,root) %{_libdir}/lib%{name}_openmp.so.* -%doc -%defattr(0644,root,root) %{_docdir}/%{name}_openmp-%{version}/* %dir %{_docdir}/%{name}_openmp-%{version} %files -n %{name}_openmp-tools -%defattr(0755,root,root) -%{_bindir}/elpa2_print_kernels_openmp +%defattr(-,root,root) +%attr(0755,root,root) %{_bindir}/elpa2_print_kernels_openmp + %files -n %{name}_openmp-devel -%defattr(0644,root,root) +%defattr(-,root,root) %{_libdir}/pkgconfig/%{name}_openmp-%{version}.pc %{_includedir}/%{name}_openmp-%{version} %{_libdir}/lib%{name}_openmp.so %{_libdir}/lib%{name}_openmp.la %files -n %{name}_openmp-devel-static -%defattr(0644,root,root) +%defattr(-,root,root) %{_libdir}/lib%{name}_openmp.a %endif diff -Nru elpa-2016.05.001/fdep/fortran_dependencies.mk elpa-2019.11.001/fdep/fortran_dependencies.mk --- elpa-2016.05.001/fdep/fortran_dependencies.mk 2016-05-19 18:28:03.000000000 
+0000 +++ elpa-2019.11.001/fdep/fortran_dependencies.mk 2019-12-19 09:47:44.000000000 +0000 @@ -11,6 +11,9 @@ _f90_verbose = $(_f90_verbose_$(V)) _f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY)) _f90_verbose_0 = @echo " $1"; +_f90_only_verbose = $(_f90_only_verbose_$(V)) +_f90_only_verbose_ = @ +_f90_only_verbose_0 = @ _f90_targets = $(call translate_name,$(PROGRAMS) $(LTLIBRARIES)) FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror @@ -81,14 +84,28 @@ endef define is_clean -$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1) +$(if $(filter-out mostlyclean clean distclean maintainer-clean am--depfiles,$(MAKECMDGOALS)),0,1) +endef + +define newline + + endef ifneq ($(call is_clean),1) include $(_f90_depfile) endif + +# $1 program +define program_dependencies + $(_f90_only_verbose){ $(foreach argument,$(_$p_use_mods) $(_$p_def_mods) $(foreach l,$(call recursive_lib_deps,$p),$(_$l_use_mods) $(_$l_def_mods)),echo $(argument); ) true; } | \ + $(top_srcdir)/fdep/fortran_dependencies.pl $p >> $@ || { rm $@; exit 1; } + +endef + $(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p) - $(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) $(foreach l,$(call recursive_lib_deps,$p),$(_$l_use_mods) $(_$l_def_mods)) >> $@; ) + $(call _f90_verbose,F90 DEPS $@)echo > $@; + $(foreach p,$(_f90_targets),$(call program_dependencies,$p)) $(_f90_depdir): @mkdir $@ diff -Nru elpa-2016.05.001/fdep/fortran_dependencies.pl elpa-2019.11.001/fdep/fortran_dependencies.pl --- elpa-2016.05.001/fdep/fortran_dependencies.pl 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/fdep/fortran_dependencies.pl 2019-12-19 09:47:44.000000000 +0000 @@ -42,7 +42,8 @@ my $target = shift; -foreach my $file (@ARGV) { +foreach my $file (<>) { + chomp($file); if 
(exists $files{$file}) { next; } else { diff -Nru elpa-2016.05.001/generated_headers.am elpa-2019.11.001/generated_headers.am --- elpa-2016.05.001/generated_headers.am 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/generated_headers.am 2019-12-19 09:47:44.000000000 +0000 @@ -1,27 +1,67 @@ define extract_interface - @echo "Generating $@..."; + @echo "Extracting interface marked with '$1' from $@..."; @grep -h "^ *$1" $^ | sed 's/^ *$1//;' >> $@ || { rm $@; exit 1; } endef -elpa test: - mkdir $@ +elpa test src: + @mkdir $@ + +test/shared: | test + @mkdir $@ -test/shared_sources: | test - mkdir $@ +generated_headers = +generated_headers += config-f90.h config-f90.h: config.h @echo "Generating $@..."; @grep "^#define" $^ > $@ || { rm $@; exit 1; } -elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa +generated_headers += elpa/elpa_generated.h +elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ + $(top_srcdir)/src/elpa_impl_math_template.F90 \ + $(top_srcdir)/src/elpa_api.F90 | elpa + @rm -f $@ $(call extract_interface,!c>) + $(call extract_interface,!c_o>) + $(call extract_interface,!c_no>) -test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources +#if OPTIONAL_C_ERROR_ARGUMENT +# +#elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ +# $(top_srcdir)/src/elpa_impl_math_template.F90 \ +# $(top_srcdir)/src/elpa_api.F90 | elpa +# $(call extract_interface,!c_o>) +# +#else +#elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ +# $(top_srcdir)/src/elpa_impl_math_template.F90 \ +# $(top_srcdir)/src/elpa_api.F90 | elpa +# $(call extract_interface,!c_no>) +#endif + +generated_headers += test/shared/generated.h +test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shared + @rm -f $@ $(call extract_interface,!c>) -elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa 
+generated_headers += src/elpa_generated_fortran_interfaces.h +src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/helpers/*.c $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src + @rm -f $@ $(call extract_interface,!f>) $(call extract_interface,#!f>) -generated_headers= config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h +generated_headers += src/elpa_generated_public_fortran_interfaces.h +src/elpa_generated_public_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/*.[ch])) | src + @rm -f $@ + $(call extract_interface,!pf>) + +generated_headers += src/fortran_constants.F90 +src/fortran_constants.F90: $(top_srcdir)/src/fortran_constants.h | src + @$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -I. $< -o $@_ || { rm -f $@; exit 1; } + @awk '/!ELPA_C_DEFINE/ {gsub(/!ELPA_C_DEFINE/, "\n"); gsub(/NEWLINE/, "\n"); print;}' < $@_ > $@ || { rm -f $@; exit 1; } + @rm $@_ + + generated-headers: $(generated_headers) + +# vim: syntax=make diff -Nru elpa-2016.05.001/INSTALL.md elpa-2019.11.001/INSTALL.md --- elpa-2016.05.001/INSTALL.md 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/INSTALL.md 2019-12-20 05:57:47.000000000 +0000 @@ -1,12 +1,25 @@ -# Installation guide # +# Installation guide for the *ELPA* library# -## Preamle ## +## Preamble ## -This file provides documentation on how to build the *ELPA* library in **version ELPA-2016.05.001**. -Although most of the documentation is generic to any *ELPA* release, some configure options -described in this document might be specific to the above mentioned version of *ELPA*. +This file provides documentation on how to build the *ELPA* library in **version ELPA-2019.11.001**. 
+With release of **version ELPA-2017.05.001** the build process has been significantly simplified, +which makes it easier to install the *ELPA* library. -## How to install ELPA ## +The old, obsolete legacy API will be deprecated in the future ! +Allready now, all new features of ELPA are only available with the new API. Thus, there +is no reason to keep the legacy API arround for too long. + +The release ELPA 2018.11.001 was the last release, where the legacy API has been +enabled by default (and can be disabled at build time). +With release ELPA 2019.05.001 the legacy API is disabled by default, however, +can be still switched on at build time. +With this release ELPA 2019.11.001, the legacy API will be deprecated and not supported anymore. + +The release of ELPA 2019.11.001 does change the API and ABI compared to the release 2019.05.002, since +the legacy API has been droped. + +## How to install *ELPA* ## First of all, if you do not want to build *ELPA* yourself, and you run Linux, it is worth having a look at the [*ELPA* webpage*] (http://elpa.mpcdf.mpg.de) @@ -18,70 +31,227 @@ please note that *ELPA* is shipped with a typical "configure" and "make" autotools procedure. This is the **only supported way** how to build and install *ELPA*. + If you obtained *ELPA* from the official git repository, you will not find -the needed configure script! Please look at the "**INSTALL_FROM_GIT_VERSION**" file -for the documentation how to proceed. +the needed configure script! You will have to create the configure scipt with autoconf. -## (A): Installing ELPA as library with configure ## +## (A): Installing *ELPA* as library with configure ## *ELPA* can be installed with the build steps - configure - make -- make check +- make check | or make check CHECK_LEVEL=extended - make install Please look at configure --help for all available options. 
-### Setting of MPI compiler and libraries ### +An excerpt of the most important (*ELPA* specific) options reads as follows: + +| configure option | description | +|:------------------------------------ |:----------------------------------------------------- | +| --enable-legacy-interface | build legacy API, will not be build as default | +| --enable-optional-argument-in-C-API | treat error arguments in C-API as optional | +| --enable-openmp | use OpenMP threading, default no. | +| --enable-redirect | for ELPA test programs, allow redirection of
stdout/stderr per MPI taks in a file
(useful for timing), default no. | +| --enable-single-precision | build with single precision version | +| --disable-timings | more detailed timing, default yes
**If disabled some features like autotune will
not work anymmore !** | +| --disable-band-to-full-blocking | build ELPA2 with blocking in band_to_full
(default:enabled) | +| --disable-mpi-module | do not use the Fortran MPI module,
get interfaces by 'include "mpif.h') | +| --disable-generic | do not build GENERIC kernels, default: enabled | +| --enable-sparc64 | do not build SPARC64 kernels, default: disabled | +| --disable-sse | do not build SSE kernels, default: enabled | +| --disable-sse-assembly | do not build SSE_ASSEMBLY kernels, default: enabled | +| --disable-avx | do not build AVX kernels, default: enabled | +| --disable-avx2 | do not build AVX2 kernels, default: enabled | +| --enable-avx512 | build AVX512 kernels, default: disabled | +| --enable-gpu | build GPU kernels, default: disabled | +| --enable-bgp | build BGP kernels, default: disabled | +| --enable-bgq | build BGQ kernels, default: disabled | +| --with-mpi=[yes|no] | compile with MPI. Default: yes | +| --with-cuda-path=PATH | prefix where CUDA is installed [default=auto] | +| --with-cuda-sdk-path=PATH | prefix where CUDA SDK is installed [default=auto] | +| --with-GPU-compute-capability=VALUE | use compute capability VALUE for GPU version,
default: "sm_35" | +| --with-fixed-real-kernel=KERNEL | compile with only a single specific real kernel. | +| --with-fixed-complex-kernel=KERNEL | compile with only a single specific complex kernel. | +| --with-gpu-support-only | Compile and always use the GPU version | +| --with-likwid=[yes|no|PATH] | use the likwid tool to measure performance (has an performance impact!), default: no | +| --with-default-real-kernel=KERNEL | set the real kernel KERNEL as default | +| --with-default-complex-kernel=KERNEL| set the compplex kernel KERNEL as default | +| --enable-scalapack-tests | build SCALAPACK test cases for performance
omparison, needs MPI, default no. | +| --enable-autotuning | enables autotuning functionality, default yes | +| --enable-c-tests | enables the C tests for elpa, default yes | +| --disable-assumed-size | do NOT use assumed-size Fortran arrays. default use | +| --enable-scalapack-tests | build also ScalaPack tests for performance comparison; needs MPI | +| --disable-Fortran2008-features | disable Fortran 2008 if compiler does not support it | +| --enable-pyhton | build and install python wrapper, default no | +| --enable-python-tests | enable python tests, default no. | +| --enable-skew-symmetric-support | enable support for real valued skew-symmetric matrices | +| --enable-store-build-config | stores the build config in the library object | +| --64bit-integer-math-support | assumes that BLAS/LAPACK/SCALAPACK use 64bit integers (experimentatl) | +| --64bit-integer-mpi-support | assumes that MPI uses 64bit integers (experimental) | +| --heterogenous-cluster-support | allows ELPA to run on clusters of nodes with different Intel CPUs (experimental) | + +We recommend that you do not build ELPA in its main directory but that you use it +in a sub-directory: + +mkdir build +cd build + +../configure [with all options needed for your system, see below] + +In this way, you have a clean separation between original *ELPA* source files and the compiled +object files + +Please note, that it is necessary to set the **compiler options** like optimisation flags etc. +for the Fortran and C part. +For example sth. like this is a usual way: ./configure FCFLAGS="-O2 -mavx" CFLAGS="-O2 -mavx" +For details, please have a look at the documentation for the compilers of your choice. + +**Note** that most kernels can only be build if the correct compiler flags for this kernel (e.g. AVX-512) +have been enabled. + + +### Choice of building with or without MPI ### + +It is possible to build the *ELPA* library with or without MPI support. 
+ +Normally *ELPA* is build with MPI, in order to speed-up calculations by using distributed +parallelisation over several nodes. This is, however, only reasonably if the programs +calling the *ELPA* library are already MPI parallized, and *ELPA* can use the same +block-cyclic distribution of data as in the calling program. + +Programs which do not support MPI parallelisation can still make use of the *ELPA* library if it +has also been build without MPI support. + +If you want to build *ELPA* with MPI support, please have a look at "A) Setting of MPI compiler and libraries". +For builds without MPI support, please have a look at "B) Building *ELPA* without MPI support". +**NOTE** that if *ELPA* is build without MPI support, it will be serial unless the OpenMP parallization is +explicitely enabled. -In the standard case *ELPA* need a MPI compiler and MPI libraries. The configure script +Please note, that it is absolutely supported that both versions of the *ELPA* library are build +and installed in the same directory. + +#### A) Setting of MPI compiler and libraries #### + +In the standard case *ELPA* needs a MPI compiler and MPI libraries. The configure script will try to set this by itself. If, however, on the build system the compiler wrapper cannot automatically found, it is recommended to set it by hand with a variable, e.g. configure FC=mpif90 -### Hybrid MPI/OpenMP library build ### -The *ELPA* library can be build to support hybrid MPI/OpenMP support. To do this the -"--enable-openmp" configure option should be said. If also a hybrid version of *ELPA* -is wanted, it is recommended to build to version of *ELPA*: one with pure MPI and -a hybrid version. They can be both installed in the same path, since the have different -so library names. +In some cases, on your system different MPI libraries and compilers are installed. Then it might happen +that during the build step an error like "no module mpi" or "cannot open module mpi" is given. 
+You can disable that the *ELPA* library uses a MPI modules (and instead uses MPI header files) by +adding + +--disable-mpi-module + +to the configure call. + +Please continue reading at "C) Enabling GPU support" + + +#### B) Building *ELPA* without MPI support #### + +If you want to build *ELPA* without MPI support, add + +--with-mpi=no + +to your configure call. + +You have to specify which compilers should be used with e.g., + +configure FC=gfortran --with-mpi=no + +**DO NOT specify a MPI compiler here!** + +Note, that the installed *ELPA* library files will be suffixed with +"_onenode", in order to discriminate this build from possible ones with MPI. + + +Please continue reading at "C) Enabling GPU support" + +### Enabling GPU support ### + +The *ELPA* library can be build with GPU support. If *ELPA* is build with GPU +support, users can choose at RUNTIME, whether to use the GPU version or not. + +For GPU support, NVIDIA GPUs with compute capability >= 3.5 are needed. + +GPU support is set with + +--enable-gpu + +It might be necessary to also set the options (please see configure --help) + +--with-cuda-path +--with-cuda-sdk-path +--with-GPU-compute-capability + +Please continue reading at "D) Enabling OpenMP support". + + +### Enabling OpenMP support ### + +The *ELPA* library can be build with OpenMP support. This can be support of hybrid +MPI/OpenMP parallelization, since *ELPA* is build with MPI support (see A ) or only +shared-memory parallization, since *ELPA* is build without MPI support (see B). + +To enable OpenMP support, add + +--enable-openmp + +as configure option. + +Note that as in case with/without MPI, you can also build and install versions of *ELPA* +with/without OpenMP support at the same time. + +However, the GPU choice at runtime is not compatible with OpenMP support. + +Please continue reading at "E) Standard libraries in default installation paths". 
+ -### Standard libraries in default installation paths### +### Standard libraries in default installation paths ### In order to build the *ELPA* library, some (depending on the settings during the -configure step, see below) libraries are needed. +configure step) libraries are needed. Typically these are: - - Basic Linear Algebra Subroutines (BLAS) - - Lapack routines - - Basic Linear Algebra Communication Subroutines (BLACS) - - Scalapack routines - - a working MPI library + - Basic Linear Algebra Subroutines (BLAS) (always needed) + - Lapack routines (always needed) + - Basic Linear Algebra Communication Subroutines (BLACS) (only needed if MPI support was set) + - Scalapack routines (only needed if MPI support was set) + - a working MPI library (only needed if MPI support was set) + - a working OpenMP library (only needed if OpenMP support was set) + - a working CUDA/cublas library (only needed if GPU support was set) If the needed library are installed on the build system in standard paths (e.g. /usr/lib64) -the in most cases the *ELPA* configure step will recognize the needed libraries +in the most cases the *ELPA* configure step will recognize the needed libraries automatically. No setting of any library paths should be necessary. +If your configure steps finish succcessfully, please continue at "G) Choice of ELPA2 compute kernels". +If your configure step aborts, or you want to use libraries in non standard paths please continue at +"F) Non standard paths or non standard libraries". + ### Non standard paths or non standard libraries ### If standard libraries are on the build system either installed in non standard paths, or special non standard libraries (e.g. *Intel's MKL*) should be used, it might be necessary -to specify the appropriate link-line with the **SCALAPACK_LDFLAGS** and **SCALAPACK_FCFLAGS** +to specify the appropriate link-line with the **SCALAPACK_LDFLAGS** and **SCALAPACK_FCFLAGS** variables. 
-For example, due to performance reasons it might be benefical to use the *BLAS*, *BLACS*, *LAPACK*, +For example, due to performance reasons it might be benefical to use the *BLAS*, *BLACS*, *LAPACK*, and *SCALAPACK* implementation from *Intel's MKL* library. -Togehter with the Intel Fortran Compiler the call to configure might then look like: +Together with the Intel Fortran Compiler the call to configure might then look like: configure SCALAPACK_LDFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" \ SCALAPACK_FCFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKL_HOME/include/intel64/lp64" -and for *INTEL MKL* togehter with *GNU GFORTRAN* : +and for *INTEL MKL* together with *GNU GFORTRAN* : configure SCALAPACK_LDFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" \ @@ -90,29 +260,119 @@ Please, for the correct link-line refer to the documentation of the correspondig library. In case of *Intel's MKL* we -sugest the [Intel Math Kernel Library Link Line Advisor] (https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor). +suggest the [Intel Math Kernel Library Link Line Advisor] (https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor). ### Choice of ELPA2 compute kernels ### -In the default the configure script tries to configure and build all ELPA2 compute kernels which are available for -the architecture. Then the specific kernel can be chosen at run-time via the api or an environment variable (see -the **USERS_GUIDE** for details). +ELPA 2stage can be used with different implementations of compute intensive kernels, which are architecture dependent. 
+Some kernels (all for x86_64 architectures) are enabled by default (and must be disabled if you do not want them), +others are disabled by default and must be enabled if they are wanted. + +One can enable "kernel classes" by setting e.g. + +--enable-avx2 + + +This will try to build all the AVX2 kernels. Please see configure --help for all options + +With + +--disable-avx2 + +one chan choose not to build the AVX2 kernels. + + +During the configure step all possible kernels will be printed, and whether they will be enabled or not. + +It is possible to build *ELPA* with as many kernels as desired, the user can then choose at runtime which +kernels should be used. It this is not desired, it is possible to build *ELPA* with only one (not necessary the same) kernel for the -real and complex valued case, respectively. This can be done with the "--with-real-..-kernel-only" and -"--with-complex-..-kernel-only" configure options. For details please do a "configure --help" +real and complex valued case, respectively. This can be done with the "--with-fixed-real-kernel=NAME" or +"--with-fixed-complex-kernel=NAME" configure options. For details please do a "configure --help" + +#### Cross compilation #### + +The ELPA library does _not_ supports cross-compilation by itself, i.e. compilation of the ELPA library on an architecture wich is not +identical than the architecture ELPA should be used on. + +Whenever a cross-compilation situation might occur, great care has to be taken during the build process by the user. + +At the moment we see two potential pitfalls: + +1.) The "build architecure" is inferior to the "target" architecture (w.r.t. the instructions sets) + +In this case, at the moment, the ELPA library can only be build with instructions sets supported on the build +system. All later instruction sets will _not_ be used in the compilation. This case might lead to less optimal +performance compared to the case that ELPA is build directly on the target system. 
+ +For example, if the "build architecture" consists of an HASWELL node (supporting up to Intel's AVX2 instruction set) and the +"target architecture" is a Skylake node (supporting Intel's AVX-512 instruction set) then the AVX-512 kernels can not be built. +This will lead to a performance degradation on the Skylake nodes, but is otherwise harmless (no crashes). + + +2.) The "build architecture" is superior to the "target" architecture (w.r.t. the instruction sets) -### No MPI, one node shared-memory version of ELPA ### +This case is a critical one, since ELPA will by default be built with instruction sets which are not supported on the target +system. This will lead to crashes, if during build the user does not take care to solve this issue. + +For example, if the "build architecture" supports Intel's AVX-2 instruction set and the +"target architecture" does only support Intel's AVX instruction set, then by default ELPA will be built with AVX-2 instruction set +and this will also be used at runtime (since it improves the performance). However, at the moment, since the target system does not support +AVX-2 instructions this will lead to a crash. + +One can avoid this unfortunate situation by disabling instruction sets which are _not_ supported on the target system. +In the case above, setting + +--disable-avx2 + +during build, will remedy this problem. -Since release 2016.05.001 it is possible to build *ELPA* without any MPI support. This version can be used -by applications, which do not have any MPI parallelisation. To set this version, use the -"--with-mpi=0" configure flag. It is strongly recommmended to also set the "--enable-openmp" -option, otherwise no parallelisation whatsoever will be present.
### Doxygen documentation ### A doxygen documentation can be created with the "--enable-doxygen-doc" configure option +### Some examples ### + +#### Intel cores supporting AVX2 (Haswell and newer) #### + +We recommend that you build ELPA with the Intel compiler (if available) for the Fortran part, but +with GNU compiler for the C part. + +1. Building with Intel Fortran compiler and GNU C compiler: + +Remarks: + - you have to know the name of the Intel Fortran compiler wrapper + - you do not have to specify a C compiler (with CC); GNU C compiler is recognized automatically + - you should specify compiler flags for Intel Fortran compiler; in the example only "-O3 -xAVX2" is set + - you should be careful with the CFLAGS, the example shows typical flags + +FC=mpi_wrapper_for_intel_Fortran_compiler CC=mpi_wrapper_for_gnu_C_compiler ./configure FCFLAGS="-O3 -xAVX2" CFLAGS="-O3 -march=native -mavx2 -mfma -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64" + + +2.
Building with GNU Fortran compiler and GNU C compiler: + +Remarks: + - you have to know the name of the GNU Fortran compiler wrapper + - you DO have to specify a C compiler (with CC); GNU C compiler is recognized automatically + - you should specify compiler flags for GNU Fortran compiler; in the example only "-O3 -march=native -mavx2 -mfma" is set + - you should be careful with the CFLAGS, the example shows typical flags + +FC=mpi_wrapper_for_gnu_Fortran_compiler CC=mpi_wrapper_for_gnu_C_compiler ./configure FCFLAGS="-O3 -march=native -mavx2 -mfma" CFLAGS="-O3 -march=native -mavx2 -mfma -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64" + + +3. Building with Intel Fortran compiler and Intel C compiler: + +Remarks: + - you have to know the name of the Intel Fortran compiler wrapper + - you have to specify the Intel C compiler + - you should specify compiler flags for Intel Fortran compiler; in the example only "-O3 -xAVX2" is set + - you should be careful with the CFLAGS, the example shows typical flags + +FC=mpi_wrapper_for_intel_Fortran_compiler CC=mpi_wrapper_for_intel_C_compiler ./configure FCFLAGS="-O3 -xAVX2" CFLAGS="-O3 -xAVX2" --enable-option-checking=fatal SCALAPACK_LDFLAGS="-L$MKLROOT/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread " SCALAPACK_FCFLAGS="-I$MKL_HOME/include/intel64/lp64" + + diff -Nru elpa-2016.05.001/install-sh elpa-2019.11.001/install-sh --- elpa-2016.05.001/install-sh 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/install-sh 2019-12-21 16:29:46.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh # install - install a program, script, or datafile -scriptversion=2013-12-25.23; # UTC
+scriptversion=2018-03-11.20; # UTC # This originates from X11R5 (mit/util/scripts/install.sh), which was # later released in X11R6 (xc/config/util/install.sh) with the @@ -271,15 +271,18 @@ fi dst=$dst_arg - # If destination is a directory, append the input filename; won't work - # if double slashes aren't ignored. + # If destination is a directory, append the input filename. if test -d "$dst"; then if test "$is_target_a_directory" = never; then echo "$0: $dst_arg: Is a directory" >&2 exit 1 fi dstdir=$dst - dst=$dstdir/`basename "$src"` + dstbase=`basename "$src"` + case $dst in + */) dst=$dst$dstbase;; + *) dst=$dst/$dstbase;; + esac dstdir_status=0 else dstdir=`dirname "$dst"` @@ -288,6 +291,11 @@ fi fi + case $dstdir in + */) dstdirslash=$dstdir;; + *) dstdirslash=$dstdir/;; + esac + obsolete_mkdir_used=false if test $dstdir_status != 0; then @@ -324,34 +332,43 @@ # is incompatible with FreeBSD 'install' when (umask & 300) != 0. ;; *) + # Note that $RANDOM variable is not portable (e.g. dash); Use it + # here however when possible just to lower collision chance. tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + trap 'ret=$?; rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null; exit $ret' 0 + + # Because "mkdir -p" follows existing symlinks and we likely work + # directly in world-writeable /tmp, make sure that the '$tmpdir' + # directory is successfully created first before we actually test + # 'mkdir -p' feature. if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + $mkdirprog $mkdir_mode "$tmpdir" && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1 then if test -z "$dir_arg" || { # Check for POSIX incompatibilities with -m. # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or # other-writable bit of parent directory when it shouldn't. # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. 
- ls_ld_tmpdir=`ls -ld "$tmpdir"` + test_tmpdir="$tmpdir/a" + ls_ld_tmpdir=`ls -ld "$test_tmpdir"` case $ls_ld_tmpdir in d????-?r-*) different_mode=700;; d????-?--*) different_mode=755;; *) false;; esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + $mkdirprog -m$different_mode -p -- "$test_tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"` test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" } } then posix_mkdir=: fi - rmdir "$tmpdir/d" "$tmpdir" + rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" else # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null fi trap '' 0;; esac;; @@ -427,8 +444,8 @@ else # Make a couple of temp file names in the proper directory. - dsttmp=$dstdir/_inst.$$_ - rmtmp=$dstdir/_rm.$$_ + dsttmp=${dstdirslash}_inst.$$_ + rmtmp=${dstdirslash}_rm.$$_ # Trap to clean up those temp files at exit. trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 @@ -493,9 +510,9 @@ done # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru elpa-2016.05.001/m4/ax_elpa_openmp.m4 elpa-2019.11.001/m4/ax_elpa_openmp.m4 --- elpa-2016.05.001/m4/ax_elpa_openmp.m4 2016-02-26 14:11:56.000000000 +0000 +++ elpa-2019.11.001/m4/ax_elpa_openmp.m4 2019-12-19 09:47:43.000000000 +0000 @@ -69,6 +69,7 @@ dnl GCC >= 4.2 -fopenmp dnl SunPRO C -xopenmp dnl Intel C -openmp + dnl Intel > 14.0 -qopenmp dnl SGI C, PGI C -mp dnl Tru64 Compaq C -omp dnl IBM C (AIX, Linux) -qsmp=omp @@ -77,7 +78,7 @@ dnl will fail (since we know that it failed without the option), dnl therefore the loop will continue searching for an option, and dnl no output file called 'penmp' or 'mp' is created. 
- for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do + for ac_option in -fopenmp -qopenmp -xopenmp -mp -omp -qsmp=omp -openmp; do ac_save_[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $ac_option" AC_LINK_IFELSE([AC_LANG_SOURCE([_AX_ELPA_LANG_OPENMP])], diff -Nru elpa-2016.05.001/m4/ax_elpa_specific_kernels.m4 elpa-2019.11.001/m4/ax_elpa_specific_kernels.m4 --- elpa-2016.05.001/m4/ax_elpa_specific_kernels.m4 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/m4/ax_elpa_specific_kernels.m4 1970-01-01 00:00:00.000000000 +0000 @@ -1,208 +0,0 @@ - -dnl macro for testing whether the user wanted to compile only with one -dnl specific real kernel - -dnl usage: DEFINE_OPTION([real-generic-kernel-only],[generic-kernel],[with_real_generic_kernel],[install_real_generic]) - -AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[ - AC_ARG_WITH([$1], - AS_HELP_STRING([--with-$1], - [only compile $2 for real case]), - [with_option=yes],[with_option=no]) - - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_real_kernel}" = x"no" ; then - - dnl make sure that all the other kernels are unset - install_real_generic=no - install_real_generic_simple=no - install_real_sse_assembly=no - install_real_bgp=no - install_real_bgq=no - install_real_sse_block2=no - install_real_sse_block4=no - install_real_sse_block6=no - install_real_avx_block2=no - install_real_avx_block4=no - install_real_avx_block6=no - want_sse=no - want_avx=no - want_avx2=no -# install_gpu=no - - use_specific_real_kernel=yes - dnl now set the specific kernel - $3=yes - dnl take care of some dependencies - if test x"${install_real_sse_block4}" = x"yes" ; then - AC_MSG_NOTICE([$1 set. Also sse_block2 is needed]) - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block4}" = x"yes" ; then - AC_MSG_NOTICE([$1 set. 
Also avx_block2 is needed]) - install_real_avx_block2=yes - fi - if test x"${install_real_sse_block6}" = x"yes" ; then - AC_MSG_NOTICE([$1 set. Also sse_block2 is needed]) - AC_MSG_NOTICE([$1 set. Also sse_block4 is needed]) - install_real_sse_block4=yes - install_real_sse_block2=yes - fi - if test x"${install_real_avx_block6}" = x"yes" ; then - AC_MSG_NOTICE([$1 set. Also avx_block2 is needed]) - AC_MSG_NOTICE([$1 set. Also avx_block4 is needed]) - install_real_avx_block4=yes - install_real_avx_block2=yes - fi - - dnl in case of SSE or AVX make sure that we can compile the choosen kernel - if test x"${install_real_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - fi - fi - - if test x"${install_real_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_sse=yes - fi - fi - - if test x"${install_real_sse_block4}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_sse=yes - fi - fi - - if test x"${install_real_sse_block6}" = x"yes" ; then - if test x"${can_compile_sse_inrinsics}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_sse=yes - fi - fi - - if test x"${install_real_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_real_avx_block4}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_real_avx_block6}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - 
fi - - AC_MSG_NOTICE([$1 will be the only compiled kernel for real case]) -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel]) -# fi - else - AC_MSG_FAILURE([$1 failed; A specific kernel for real case has already been defined before!]) - fi - fi -]) - - -AC_DEFUN([DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL],[ - AC_ARG_WITH([$1], - AS_HELP_STRING([--with-$1], - [only compile $2 for complex case]), - [with_option=yes],[with_option=no]) - - if test x"${with_option}" = x"yes" ; then - if test x"${use_specific_complex_kernel}" = x"no" ; then - - dnl make sure that all the other kernels are unset - install_complex_generic=no - install_complex_generic_simple=no - install_complex_sse_assembly=no - install_complex_bgp=no - install_complex_bgq=no - install_complex_sse_block1=no - install_complex_sse_block2=no - install_complex_avx_block1=no - install_complex_avx_block2=no - want_sse=no - want_avx=no - want_avx2=no - -# install_gpu=no - use_specific_complex_kernel=yes - dnl now set the specific kernel - $3=yes - dnl take care of some dependencies - if test x"${install_complex_sse_block2}" = x"yes" ; then - install_complex_sse_block1=yes - fi - if test x"${install_complex_avx_block2}" = x"yes" ; then - install_complex_avx_block1=yes - fi - - dnl in case of SSE or AVX make sure that we can compile the choosen kernel - if test x"${install_complex_sse_assembly}" = x"yes" ; then - if test x"${can_compile_sse_assembly}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - fi - fi - - if test x"${install_complex_sse_block1}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_sse=yes - fi - fi - - if test x"${install_complex_sse_block2}" = x"yes" ; then - if test x"${can_compile_sse_intrinsics}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was 
set, but cannot be compiled!]) - else - want_sse=yes - fi - fi - if test x"${install_complex_avx_block1}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - if test x"${install_complex_avx_block2}" = x"yes" ; then - if test x"${can_compile_avx}" = x"no" ; then - AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) - else - want_avx=yes - fi - fi - - AC_MSG_NOTICE([$1 will be the only compiled kernel for complex case]) -# if test x"${want_gpu}" = x"yes" ; then -# AC_MSG_WARN([At the moment this disables GPU support!]) -# AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel]) -# fi - else - AC_MSG_FAILURE([$1 failed; A specific kernel for complex case has already been defined before!]) - fi - fi -]) - diff -Nru elpa-2016.05.001/m4/ax_ext.m4 elpa-2019.11.001/m4/ax_ext.m4 --- elpa-2016.05.001/m4/ax_ext.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/m4/ax_ext.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,329 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_ext.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_EXT +# +# DESCRIPTION +# +# Find supported SIMD extensions by requesting cpuid. When a SIMD +# extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if +# compiler supports it. For example, if "sse2" is available then "-msse2" +# is added to SIMD_FLAGS. +# +# Find other supported CPU extensions by requesting cpuid. When a +# processor extension is found, the -m"extensionname" is added to +# CPUEXT_FLAGS if compiler supports it. For example, if "bmi2" is +# available then "-mbmi2" is added to CPUEXT_FLAGS. 
+# +# This macro calls: +# +# AC_SUBST(SIMD_FLAGS) +# AC_SUBST(CPUEXT_FLAGS) +# +# And defines: +# +# HAVE_RDRND / HAVE_BMI1 / HAVE_BMI2 / HAVE_ADX / HAVE_MPX +# HAVE_PREFETCHWT1 / HAVE_ABM / HAVE_MMX / HAVE_SSE / HAVE_SSE2 +# HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4_1 / HAVE_SSE4_2 / HAVE_SSE4a +# HAVE_SHA / HAVE_AES / HAVE_AVX / HAVE_FMA3 / HAVE_FMA4 / HAVE_XOP +# HAVE_AVX2 / HAVE_AVX512_F / HAVE_AVX512_CD / HAVE_AVX512_PF +# HAVE_AVX512_ER / HAVE_AVX512_VL / HAVE_AVX512_BW / HAVE_AVX512_DQ +# HAVE_AVX512_IFMA / HAVE_AVX512_VBMI / HAVE_ALTIVEC / HAVE_VSX +# +# LICENSE +# +# Copyright (c) 2007 Christophe Tournayre +# Copyright (c) 2013,2015 Michael Petch +# Copyright (c) 2017 Rafael de Lucena Valle +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 18 + +AC_DEFUN([AX_EXT], +[ + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([AC_PROG_CC]) + + CPUEXT_FLAGS="" + SIMD_FLAGS="" + + case $host_cpu in + powerpc*) + AC_CACHE_CHECK([whether altivec is supported for old distros], [ax_cv_have_altivec_old_ext], + [ + if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then + if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then + ax_cv_have_altivec_old_ext=yes + fi + fi + ]) + + if test "$ax_cv_have_altivec_old_ext" = yes; then + AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) + AX_CHECK_COMPILE_FLAG(-faltivec, SIMD_FLAGS="$SIMD_FLAGS -faltivec", []) + fi + + AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], + [ + if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c altivec` != 0; then + ax_cv_have_altivec_ext=yes + fi + ]) + + if test "$ax_cv_have_altivec_ext" = yes; then + AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) + AX_CHECK_COMPILE_FLAG(-maltivec, SIMD_FLAGS="$SIMD_FLAGS -maltivec", []) + fi + + 
AC_CACHE_CHECK([whether vsx is supported], [ax_cv_have_vsx_ext], + [ + if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c vsx` != 0; then + ax_cv_have_vsx_ext=yes + fi + ]) + + if test "$ax_cv_have_vsx_ext" = yes; then + AC_DEFINE(HAVE_VSX,,[Support VSX instructions]) + AX_CHECK_COMPILE_FLAG(-mvsx, SIMD_FLAGS="$SIMD_FLAGS -mvsx", []) + fi + ;; + + i[[3456]]86*|x86_64*|amd64*) + + AC_REQUIRE([AX_GCC_X86_CPUID]) + AC_REQUIRE([AX_GCC_X86_CPUID_COUNT]) + AC_REQUIRE([AX_GCC_X86_AVX_XGETBV]) + + eax_cpuid0=0 + AX_GCC_X86_CPUID(0x00000000) + if test "$ax_cv_gcc_x86_cpuid_0x00000000" != "unknown"; + then + eax_cpuid0=`echo $ax_cv_gcc_x86_cpuid_0x00000000 | cut -d ":" -f 1` + fi + + eax_cpuid80000000=0 + AX_GCC_X86_CPUID(0x80000000) + if test "$ax_cv_gcc_x86_cpuid_0x80000000" != "unknown"; + then + eax_cpuid80000000=`echo $ax_cv_gcc_x86_cpuid_0x80000000 | cut -d ":" -f 1` + fi + + ecx_cpuid1=0 + edx_cpuid1=0 + if test "$((0x$eax_cpuid0))" -ge 1 ; then + AX_GCC_X86_CPUID(0x00000001) + if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown"; + then + ecx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` + edx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` + fi + fi + + ebx_cpuid7=0 + ecx_cpuid7=0 + if test "$((0x$eax_cpuid0))" -ge 7 ; then + AX_GCC_X86_CPUID_COUNT(0x00000007, 0x00) + if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown"; + then + ebx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2` + ecx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 3` + fi + fi + + ecx_cpuid80000001=0 + edx_cpuid80000001=0 + if test "$((0x$eax_cpuid80000000))" -ge "$((0x80000001))" ; then + AX_GCC_X86_CPUID(0x80000001) + if test "$ax_cv_gcc_x86_cpuid_0x80000001" != "unknown"; + then + ecx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 3` + edx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 4` + fi + fi + + AC_CACHE_VAL([ax_cv_have_mmx_os_support_ext], + [ + 
ax_cv_have_mmx_os_support_ext=yes + ]) + + ax_cv_have_none_os_support_ext=yes + + AC_CACHE_VAL([ax_cv_have_sse_os_support_ext], + [ + ax_cv_have_sse_os_support_ext=no, + if test "$((0x$edx_cpuid1>>25&0x01))" = 1; then + AC_LANG_PUSH([C]) + AC_RUN_IFELSE([AC_LANG_SOURCE([[ +#include +#include + /* No way at ring1 to ring3 in protected mode to check the CR0 and CR4 + control registers directly. Execute an SSE instruction. + If it raises SIGILL then OS doesn't support SSE based instructions */ + void sig_handler(int signum){ exit(1); } + int main(){ + signal(SIGILL, sig_handler); + /* SSE instruction xorps %xmm0,%xmm0 */ + __asm__ __volatile__ (".byte 0x0f, 0x57, 0xc0"); + return 0; + }]])], + [ax_cv_have_sse_os_support_ext=yes], + [ax_cv_have_sse_os_support_ext=no], + [ax_cv_have_sse_os_support_ext=no]) + AC_LANG_POP([C]) + fi + ]) + + xgetbv_eax=0 + if test "$((0x$ecx_cpuid1>>28&0x01))" = 1; then + AX_GCC_X86_AVX_XGETBV(0x00000000) + + if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then + xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` + fi + + AC_CACHE_VAL([ax_cv_have_avx_os_support_ext], + [ + ax_cv_have_avx_os_support_ext=no + if test "$((0x$ecx_cpuid1>>27&0x01))" = 1; then + if test "$((0x$xgetbv_eax&0x6))" = 6; then + ax_cv_have_avx_os_support_ext=yes + fi + fi + ]) + fi + + AC_CACHE_VAL([ax_cv_have_avx512_os_support_ext], + [ + ax_cv_have_avx512_os_support_ext=no + if test "$ax_cv_have_avx_os_support_ext" = yes; then + if test "$((0x$xgetbv_eax&0xe6))" = "$((0xe6))"; then + ax_cv_have_avx512_os_support_ext=yes + fi + fi + ]) + + for ac_instr_info dnl + in "none;rdrnd;RDRND;ecx_cpuid1,30;-mrdrnd;HAVE_RDRND;CPUEXT_FLAGS" dnl + "none;bmi1;BMI1;ebx_cpuid7,3;-mbmi;HAVE_BMI1;CPUEXT_FLAGS" dnl + "none;bmi2;BMI2;ebx_cpuid7,8;-mbmi2;HAVE_BMI2;CPUEXT_FLAGS" dnl + "none;adx;ADX;ebx_cpuid7,19;-madx;HAVE_ADX;CPUEXT_FLAGS" dnl + "none;mpx;MPX;ebx_cpuid7,14;-mmpx;HAVE_MPX;CPUEXT_FLAGS" dnl + 
"none;prefetchwt1;PREFETCHWT1;ecx_cpuid7,0;-mprefetchwt1;HAVE_PREFETCHWT1;CPUEXT_FLAGS" dnl + "none;abm;ABM;ecx_cpuid80000001,5;-mabm;HAVE_ABM;CPUEXT_FLAGS" dnl + "mmx;mmx;MMX;edx_cpuid1,23;-mmmx;HAVE_MMX;SIMD_FLAGS" dnl + "sse;sse;SSE;edx_cpuid1,25;-msse;HAVE_SSE;SIMD_FLAGS" dnl + "sse;sse2;SSE2;edx_cpuid1,26;-msse2;HAVE_SSE2;SIMD_FLAGS" dnl + "sse;sse3;SSE3;ecx_cpuid1,1;-msse3;HAVE_SSE3;SIMD_FLAGS" dnl + "sse;ssse3;SSSE3;ecx_cpuid1,9;-mssse3;HAVE_SSSE3;SIMD_FLAGS" dnl + "sse;sse41;SSE4.1;ecx_cpuid1,19;-msse4.1;HAVE_SSE4_1;SIMD_FLAGS" dnl + "sse;sse42;SSE4.2;ecx_cpuid1,20;-msse4.2;HAVE_SSE4_2;SIMD_FLAGS" dnl + "sse;sse4a;SSE4a;ecx_cpuid80000001,6;-msse4a;HAVE_SSE4a;SIMD_FLAGS" dnl + "sse;sha;SHA;ebx_cpuid7,29;-msha;HAVE_SHA;SIMD_FLAGS" dnl + "sse;aes;AES;ecx_cpuid1,25;-maes;HAVE_AES;SIMD_FLAGS" dnl + "avx;avx;AVX;ecx_cpuid1,28;-mavx;HAVE_AVX;SIMD_FLAGS" dnl + "avx;fma3;FMA3;ecx_cpuid1,12;-mfma;HAVE_FMA3;SIMD_FLAGS" dnl + "avx;fma4;FMA4;ecx_cpuid80000001,16;-mfma4;HAVE_FMA4;SIMD_FLAGS" dnl + "avx;xop;XOP;ecx_cpuid80000001,11;-mxop;HAVE_XOP;SIMD_FLAGS" dnl + "avx;avx2;AVX2;ebx_cpuid7,5;-mavx2;HAVE_AVX2;SIMD_FLAGS" dnl + "avx512;avx512f;AVX512-F;ebx_cpuid7,16;-mavx512f;HAVE_AVX512_F;SIMD_FLAGS" dnl + "avx512;avx512cd;AVX512-CD;ebx_cpuid7,28;-mavx512cd;HAVE_AVX512_CD;SIMD_FLAGS" dnl + "avx512;avx512pf;AVX512-PF;ebx_cpuid7,26;-mavx512pf;HAVE_AVX512_PF;SIMD_FLAGS" dnl + "avx512;avx512er;AVX512-ER;ebx_cpuid7,27;-mavx512er;HAVE_AVX512_ER;SIMD_FLAGS" dnl + "avx512;avx512vl;AVX512-VL;ebx_cpuid7,31;-mavx512vl;HAVE_AVX512_VL;SIMD_FLAGS" dnl + "avx512;avx512bw;AVX512-BW;ebx_cpuid7,30;-mavx512bw;HAVE_AVX512_BW;SIMD_FLAGS" dnl + "avx512;avx512dq;AVX512-DQ;ebx_cpuid7,17;-mavx512dq;HAVE_AVX512_DQ;SIMD_FLAGS" dnl + "avx512;avx512ifma;AVX512-IFMA;ebx_cpuid7,21;-mavx512ifma;HAVE_AVX512_IFMA;SIMD_FLAGS" dnl + "avx512;avx512vbmi;AVX512-VBMI;ecx_cpuid7,1;-mavx512vbmi;HAVE_AVX512_VBMI;SIMD_FLAGS" dnl + # + do ac_instr_os_support=$(eval echo \$ax_cv_have_$(echo $ac_instr_info | cut -d ";" 
-f 1)_os_support_ext) + ac_instr_acvar=$(echo $ac_instr_info | cut -d ";" -f 2) + ac_instr_shortname=$(echo $ac_instr_info | cut -d ";" -f 3) + ac_instr_chk_loc=$(echo $ac_instr_info | cut -d ";" -f 4) + ac_instr_chk_reg=0x$(eval echo \$$(echo $ac_instr_chk_loc | cut -d "," -f 1)) + ac_instr_chk_bit=$(echo $ac_instr_chk_loc | cut -d "," -f 2) + ac_instr_compiler_flags=$(echo $ac_instr_info | cut -d ";" -f 5) + ac_instr_have_define=$(echo $ac_instr_info | cut -d ";" -f 6) + ac_instr_flag_type=$(echo $ac_instr_info | cut -d ";" -f 7) + + AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor], [ax_cv_have_${ac_instr_acvar}_cpu_ext], + [ + eval ax_cv_have_${ac_instr_acvar}_cpu_ext=no + if test "$((${ac_instr_chk_reg}>>${ac_instr_chk_bit}&0x01))" = 1 ; then + eval ax_cv_have_${ac_instr_acvar}_cpu_ext=yes + fi + ]) + + if test x"$(eval echo \$ax_cv_have_${ac_instr_acvar}_cpu_ext)" = x"yes"; then + AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor and OS], [ax_cv_have_${ac_instr_acvar}_ext], + [ + eval ax_cv_have_${ac_instr_acvar}_ext=no + if test x"${ac_instr_os_support}" = x"yes"; then + eval ax_cv_have_${ac_instr_acvar}_ext=yes + fi + ]) + + if test "$(eval echo \$ax_cv_have_${ac_instr_acvar}_ext)" = yes; then + AX_CHECK_COMPILE_FLAG(${ac_instr_compiler_flags}, eval ax_cv_support_${ac_instr_acvar}_ext=yes, + eval ax_cv_support_${ac_instr_acvar}_ext=no) + if test x"$(eval echo \$ax_cv_support_${ac_instr_acvar}_ext)" = x"yes"; then + eval ${ac_instr_flag_type}=\"\$${ac_instr_flag_type} ${ac_instr_compiler_flags}\" + AC_DEFINE_UNQUOTED([${ac_instr_have_define}]) + else + AC_MSG_WARN([Your processor and OS supports ${ac_instr_shortname} instructions but not your compiler, can you try another compiler?]) + fi + else + if test x"${ac_instr_os_support}" = x"no"; then + AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) + AC_MSG_WARN([Your processor supports ${ac_instr_shortname}, but 
your OS doesn't]) + fi + fi + else + AC_CACHE_VAL(ax_cv_have_${ac_instr_acvar}_ext, eval ax_cv_have_${ac_instr_acvar}_ext=no) + AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) + fi + done + ;; + esac + + AH_TEMPLATE([HAVE_RDRND],[Define to 1 to support Digital Random Number Generator]) + AH_TEMPLATE([HAVE_BMI1],[Define to 1 to support Bit Manipulation Instruction Set 1]) + AH_TEMPLATE([HAVE_BMI2],[Define to 1 to support Bit Manipulation Instruction Set 2]) + AH_TEMPLATE([HAVE_ADX],[Define to 1 to support Multi-Precision Add-Carry Instruction Extensions]) + AH_TEMPLATE([HAVE_MPX],[Define to 1 to support Memory Protection Extensions]) + AH_TEMPLATE([HAVE_PREFETCHWT1],[Define to 1 to support Prefetch Vector Data Into Caches WT1]) + AH_TEMPLATE([HAVE_ABM],[Define to 1 to support Advanced Bit Manipulation]) + AH_TEMPLATE([HAVE_MMX],[Define to 1 to support Multimedia Extensions]) + AH_TEMPLATE([HAVE_SSE],[Define to 1 to support Streaming SIMD Extensions]) + AH_TEMPLATE([HAVE_SSE2],[Define to 1 to support Streaming SIMD Extensions]) + AH_TEMPLATE([HAVE_SSE3],[Define to 1 to support Streaming SIMD Extensions 3]) + AH_TEMPLATE([HAVE_SSSE3],[Define to 1 to support Supplemental Streaming SIMD Extensions 3]) + AH_TEMPLATE([HAVE_SSE4_1],[Define to 1 to support Streaming SIMD Extensions 4.1]) + AH_TEMPLATE([HAVE_SSE4_2],[Define to 1 to support Streaming SIMD Extensions 4.2]) + AH_TEMPLATE([HAVE_SSE4a],[Define to 1 to support AMD Streaming SIMD Extensions 4a]) + AH_TEMPLATE([HAVE_SHA],[Define to 1 to support Secure Hash Algorithm Extension]) + AH_TEMPLATE([HAVE_AES],[Define to 1 to support Advanced Encryption Standard New Instruction Set (AES-NI)]) + AH_TEMPLATE([HAVE_AVX],[Define to 1 to support Advanced Vector Extensions]) + AH_TEMPLATE([HAVE_FMA3],[Define to 1 to support Fused Multiply-Add Extensions 3]) + AH_TEMPLATE([HAVE_FMA4],[Define to 1 to support Fused Multiply-Add Extensions 4]) + AH_TEMPLATE([HAVE_XOP],[Define to 1 to 
support eXtended Operations Extensions]) + AH_TEMPLATE([HAVE_AVX2],[Define to 1 to support Advanced Vector Extensions 2]) + AH_TEMPLATE([HAVE_AVX512_F],[Define to 1 to support AVX-512 Foundation Extensions]) + AH_TEMPLATE([HAVE_AVX512_CD],[Define to 1 to support AVX-512 Conflict Detection Instructions]) + AH_TEMPLATE([HAVE_AVX512_PF],[Define to 1 to support AVX-512 Conflict Prefetch Instructions]) + AH_TEMPLATE([HAVE_AVX512_ER],[Define to 1 to support AVX-512 Exponential & Reciprocal Instructions]) + AH_TEMPLATE([HAVE_AVX512_VL],[Define to 1 to support AVX-512 Vector Length Extensions]) + AH_TEMPLATE([HAVE_AVX512_BW],[Define to 1 to support AVX-512 Byte and Word Instructions]) + AH_TEMPLATE([HAVE_AVX512_DQ],[Define to 1 to support AVX-512 Doubleword and Quadword Instructions]) + AH_TEMPLATE([HAVE_AVX512_IFMA],[Define to 1 to support AVX-512 Integer Fused Multiply Add Instructions]) + AH_TEMPLATE([HAVE_AVX512_VBMI],[Define to 1 to support AVX-512 Vector Byte Manipulation Instructions]) + AC_SUBST(SIMD_FLAGS) + AC_SUBST(CPUEXT_FLAGS) +]) + diff -Nru elpa-2016.05.001/m4/ax_fc_check_define.m4 elpa-2019.11.001/m4/ax_fc_check_define.m4 --- elpa-2016.05.001/m4/ax_fc_check_define.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/m4/ax_fc_check_define.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,16 @@ +dnl +dnl AX_FC_CHECK_DEFINE(MACRONAME, [ACTION_IF_DEFINED], [ACTION_IF_NOT_DEFINED]) +dnl +AC_DEFUN([AX_FC_CHECK_DEFINE], [ + AC_LANG_PUSH([Fortran]) + AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +program test_define +#ifndef $1 + choke me +#endif +end program + ])], + [$2], + [$3]) + AC_LANG_POP([Fortran]) +]) diff -Nru elpa-2016.05.001/m4/ax_gcc_x86_avx_xgetbv.m4 elpa-2019.11.001/m4/ax_gcc_x86_avx_xgetbv.m4 --- elpa-2016.05.001/m4/ax_gcc_x86_avx_xgetbv.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/m4/ax_gcc_x86_avx_xgetbv.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,80 @@ +# 
=========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_GCC_X86_AVX_XGETBV +# +# DESCRIPTION +# +# On later x86 processors with AVX SIMD support, with gcc or a compiler +# that has a compatible syntax for inline assembly instructions, run a +# small program that executes the xgetbv instruction with input OP. This +# can be used to detect if the OS supports AVX instruction usage. +# +# On output, the values of the eax and edx registers are stored as +# hexadecimal strings as "eax:edx" in the cache variable +# ax_cv_gcc_x86_avx_xgetbv. +# +# If the xgetbv instruction fails (because you are running a +# cross-compiler, or because you are not using gcc, or because you are on +# a processor that doesn't have this instruction), +# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown". +# +# This macro mainly exists to be used in AX_EXT. +# +# LICENSE +# +# Copyright (c) 2013 Michael Petch +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. 
You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 3 + +AC_DEFUN([AX_GCC_X86_AVX_XGETBV], +[AC_REQUIRE([AC_PROG_CC]) +AC_LANG_PUSH([C]) +AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1, + [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include ], [ + int op = $1, eax, edx; + FILE *f; + /* Opcodes for xgetbv */ + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (op)); + f = fopen("conftest_xgetbv", "w"); if (!f) return 1; + fprintf(f, "%x:%x\n", eax, edx); + fclose(f); + return 0; +])], + [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv], + [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv], + [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])]) +AC_LANG_POP([C]) +]) + diff -Nru elpa-2016.05.001/m4/ax_gcc_x86_cpuid.m4 elpa-2019.11.001/m4/ax_gcc_x86_cpuid.m4 --- elpa-2016.05.001/m4/ax_gcc_x86_cpuid.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/m4/ax_gcc_x86_cpuid.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,90 @@ + +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_GCC_X86_CPUID(OP) +# AX_GCC_X86_CPUID_COUNT(OP, COUNT) +# +# DESCRIPTION +# +# On Pentium and later x86 processors, with gcc or a compiler that has a +# compatible syntax for inline assembly 
instructions, run a small program +# that executes the cpuid instruction with input OP. This can be used to +# detect the CPU type. AX_GCC_X86_CPUID_COUNT takes an additional COUNT +# parameter that gets passed into register ECX before calling cpuid. +# +# On output, the values of the eax, ebx, ecx, and edx registers are stored +# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable +# ax_cv_gcc_x86_cpuid_OP. +# +# If the cpuid instruction fails (because you are running a +# cross-compiler, or because you are not using gcc, or because you are on +# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP +# is set to the string "unknown". +# +# This macro mainly exists to be used in AX_GCC_ARCHFLAG. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson +# Copyright (c) 2008 Matteo Frigo +# Copyright (c) 2015 Michael Petch +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. 
+# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 10 + +AC_DEFUN([AX_GCC_X86_CPUID], +[AX_GCC_X86_CPUID_COUNT($1, 0) +]) + +AC_DEFUN([AX_GCC_X86_CPUID_COUNT], +[AC_REQUIRE([AC_PROG_CC]) +AC_LANG_PUSH([C]) +AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1, + [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include ], [ + int op = $1, level = $2, eax, ebx, ecx, edx; + FILE *f; + __asm__ __volatile__ ("xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n" + : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx) + : "a" (op), "2" (level)); + + f = fopen("conftest_cpuid", "w"); if (!f) return 1; + fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); + fclose(f); + return 0; +])], + [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid], + [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid], + [ax_cv_gcc_x86_cpuid_$1=unknown])]) +AC_LANG_POP([C]) +]) diff -Nru elpa-2016.05.001/m4/m4_ax_check_compile_flag.m4 elpa-2019.11.001/m4/m4_ax_check_compile_flag.m4 --- elpa-2016.05.001/m4/m4_ax_check_compile_flag.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/m4/m4_ax_check_compile_flag.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,74 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the current language's compiler +# or gives an error. (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. 
+# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# INPUT gives an alternative input source to AC_COMPILE_IFELSE. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this +# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. 
When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 5 + +AC_DEFUN([AX_CHECK_COMPILE_FLAG], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl +AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ + ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" + AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) +AS_VAR_IF(CACHEVAR,yes, + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_COMPILE_FLAGS diff -Nru elpa-2016.05.001/Makefile.am elpa-2019.11.001/Makefile.am --- elpa-2016.05.001/Makefile.am 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/Makefile.am 2019-12-19 09:47:43.000000000 +0000 @@ -2,169 +2,485 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 -AM_FCFLAGS = $(SCALAPACK_FCFLAGS) @FC_MODINC@modules @FC_MODOUT@modules +AM_FCFLAGS = $(SCALAPACK_FCFLAGS) $(FC_MODINC)modules AM_LDFLAGS = $(SCALAPACK_LDFLAGS) +FCLD = @ACTUAL_FC@ + # libelpa lib_LTLIBRARIES = libelpa@SUFFIX@.la -libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ - -libelpa@SUFFIX@_la_SOURCES = \ - src/mod_precision.f90 \ - src/mod_mpi.F90 \ - src/mod_mpi_stubs.F90 \ - src/elpa2_kernels/mod_fortran_interfaces.F90 \ - src/elpa_utilities.F90 \ - src/elpa1_compute.F90 \ - src/elpa1.F90 \ - src/elpa2_utilities.F90 \ - src/mod_pack_unpack_real.F90 \ - src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_complex.F90 \ - src/mod_pack_unpack_complex.F90 \ - src/aligned_mem.F90 \ - src/elpa2_compute.F90 \ - src/elpa2.F90 \ - src/elpa_c_interface.F90 \ - src/elpa_qr/qr_utils.F90 
\ - src/elpa_qr/elpa_qrkernels.f90 \ - src/elpa_qr/elpa_pdlarfb.F90 \ - src/elpa_qr/elpa_pdgeqrf.F90 - -EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES = \ - src/elpa_reduce_add_vectors.X90 \ - src/elpa_transpose_vectors.X90 \ - src/redist_band.X90 +libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) +libelpa@SUFFIX@_la_LIBADD = libelpa@SUFFIX@_public.la libelpa@SUFFIX@_private.la +libelpa@SUFFIX@_la_SOURCES = + +# parts with public interface +noinst_LTLIBRARIES = libelpa@SUFFIX@_public.la +libelpa@SUFFIX@_public_la_FCFLAGS = $(AM_FCFLAGS) $(FC_MODOUT)modules $(FC_MODINC)private_modules +libelpa@SUFFIX@_public_la_SOURCES = \ + src/elpa.F90 \ + src/elpa_api.F90 \ + src/elpa_constants.F90 + +# internal parts +noinst_LTLIBRARIES += libelpa@SUFFIX@_private.la +libelpa@SUFFIX@_private_la_FCFLAGS = $(AM_FCFLAGS) $(FC_MODOUT)private_modules $(FC_MODINC)private_modules +libelpa@SUFFIX@_private_la_SOURCES = \ + src/elpa_impl.F90 \ + src/elpa_autotune_impl.F90 \ + src/elpa_abstract_impl.F90 \ + src/helpers/mod_precision.F90 \ + src/helpers/mod_blas_interfaces.F90 \ + src/helpers/mod_scalapack_interfaces.F90 \ + src/helpers/mod_mpi.F90 \ + src/helpers/mod_mpi_stubs.F90 \ + src/helpers/mod_omp.F90 \ + src/elpa_generated_fortran_interfaces.F90 \ + src/elpa2/mod_redist_band.F90 \ + src/elpa2/mod_pack_unpack_cpu.F90 \ + src/elpa2/mod_compute_hh_trafo.F90 \ + src/helpers/aligned_mem.F90 \ + src/elpa1/elpa1_compute_private.F90 \ + src/elpa1/elpa1_auxiliary.F90 \ + src/elpa2/elpa2_determine_workload.F90 \ + src/elpa2/elpa2_compute.F90 \ + src/elpa2/kernels/mod_single_hh_trafo_real.F90 \ + src/GPU/check_for_gpu.F90 \ + src/GPU/mod_cuda.F90 \ + src/elpa2/GPU/interface_c_kernel.F90 \ + src/elpa2/mod_pack_unpack_gpu.F90 \ + src/elpa2/qr/qr_utils.F90 \ + src/elpa2/qr/elpa_qrkernels.F90 \ + src/elpa2/qr/elpa_pdlarfb.F90 \ + src/elpa2/qr/elpa_pdgeqrf.F90 \ + src/elpa1/elpa1.F90 \ + src/elpa2/elpa2.F90 \ + src/elpa_generalized/cannon.c \ + src/helpers/matrix_plot.F90 \ 
+ src/general/mod_elpa_skewsymmetric_blas.F90 \ + src/elpa_index.c + +libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c + + +libelpa@SUFFIX@_private_la_SOURCES += \ + src/general/elpa_utilities.F90 + +EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ + src/elpa1/elpa_reduce_add_vectors.F90 \ + src/elpa1/elpa_transpose_vectors.F90 \ + src/elpa_api_math_template.F90 \ + src/elpa_impl_math_template.F90 \ + src/elpa_impl_generalized_transform_template.F90 \ + src/elpa1/elpa1_compute_template.F90 \ + src/elpa2/elpa2_compute_real_template.F90 \ + src/elpa2/elpa2_compute_complex_template.F90 \ + src/elpa1/elpa1_template.F90 \ + src/elpa2/elpa2_template.F90 \ + src/elpa2/qr/qr_utils_template.F90 \ + src/elpa2/qr/elpa_pdlarfb_template.F90 \ + src/elpa2/qr/elpa_pdgeqrf_template.F90 \ + src/elpa2/elpa2_bandred_template.F90 \ + src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \ + src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ + src/elpa2/elpa2_tridiag_band_template.F90 \ + src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ + src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 \ + src/elpa2/kernels/real_template.F90 \ + src/elpa2/kernels/complex_template.F90 \ + src/elpa2/kernels/simple_template.F90 \ + src/elpa2/kernels/simple_block4_template.F90 \ + src/elpa2/kernels/simple_block6_template.F90 \ + src/elpa2/pack_unpack_cpu.F90 \ + src/elpa2/pack_unpack_gpu.F90 \ + src/elpa2/compute_hh_trafo.F90 \ + src/elpa2/redist_band.F90 \ + src/general/sanity.F90 \ + src/elpa1/elpa_cholesky_template.F90 \ + src/elpa1/elpa_invert_trm.F90 \ + src/elpa1/elpa_multiply_a_b.F90 \ + src/elpa1/elpa_solve_tridi_impl_public.F90 \ + src/general/elpa_ssr2_template.F90 \ + src/general/elpa_ssmv_template.F90 \ + src/general/precision_macros.h \ + src/general/precision_typedefs.h \ + src/general/precision_kinds.F90 if HAVE_DETAILED_TIMINGS - libelpa@SUFFIX@_la_SOURCES += \ - src/timer.F90 \ - src/ftimings/ftimings.F90 \ - src/ftimings/ftimings_type.F90 \ - 
src/ftimings/ftimings_value.F90 \ - src/ftimings/highwater_mark.c \ - src/ftimings/resident_set_size.c \ - src/ftimings/time.c \ - src/ftimings/virtual_memory.c \ - src/ftimings/papi.c +libelpa@SUFFIX@_private_la_SOURCES += \ + src/ftimings/ftimings.F90 \ + src/ftimings/ftimings_type.F90 \ + src/ftimings/ftimings_value.F90 \ + src/ftimings/highwater_mark.c \ + src/ftimings/resident_set_size.c \ + src/ftimings/time.c \ + src/ftimings/virtual_memory.c \ + src/ftimings/papi.c + +else +libelpa@SUFFIX@_private_la_SOURCES += \ + src/helpers/timer_dummy.F90 +endif + +if WITH_GPU_VERSION + libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu + EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES += src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu endif if !WITH_MPI - libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90 + libelpa@SUFFIX@_private_la_SOURCES += src/helpers/mod_time_c.F90 if !HAVE_DETAILED_TIMINGS - libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c + libelpa@SUFFIX@_private_la_SOURCES += src/ftimings/time.c +endif endif + + +if HAVE_HETEROGENOUS_CLUSTER_SUPPORT + libelpa@SUFFIX@_private_la_SOURCES += src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90 endif if WITH_REAL_GENERIC_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90 + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real.F90 endif if WITH_COMPLEX_GENERIC_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.F90 + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex.F90 endif if WITH_REAL_GENERIC_SIMPLE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.F90 + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple.F90 endif if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.F90 + 
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_simple.F90 +endif + +if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90 +endif + +if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90 endif if WITH_REAL_BGP_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90 endif if WITH_REAL_BGQ_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgq.f90 endif if WITH_REAL_SSE_ASSEMBLY_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/asm_x86_64_double_precision.s +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/asm_x86_64_single_precision.s +endif + else if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/asm_x86_64_double_precision.s +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/asm_x86_64_single_precision.s +endif +endif +endif + +if WITH_REAL_SPARC64_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_2hv_single_precision.c +#endif +endif + +if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c +endif +endif + +if WITH_REAL_VSX_BLOCK2_KERNEL + 
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_single_precision.c endif endif if WITH_REAL_SSE_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_2hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c +endif +else +if WITH_REAL_AVX2_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c +endif +endif +endif + +if WITH_REAL_AVX512_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_2hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_2hv_single_precision.c +endif +endif + +if WITH_REAL_SPARC64_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c +#endif +endif + +if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c +endif 
+endif + +if WITH_REAL_VSX_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c +endif endif if WITH_REAL_SSE_BLOCK4_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK4_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c +endif +else +if WITH_REAL_AVX2_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c +endif +endif +endif + +if WITH_REAL_AVX512_BLOCK4_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_4hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_4hv_single_precision.c +endif +endif + +if WITH_REAL_SPARC64_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c +#endif +endif + +if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += 
src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c +endif +endif + +if WITH_REAL_VSX_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c +endif endif if WITH_REAL_SSE_BLOCK6_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_single_precision.c +endif endif if WITH_REAL_AVX_BLOCK6_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c +endif +else +if WITH_REAL_AVX2_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c +endif +endif endif +if WITH_REAL_AVX512_BLOCK6_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_6hv_double_precision.c +if WANT_SINGLE_PRECISION_REAL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_avx512_6hv_single_precision.c +endif +endif + +#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_single_precision.c +#endif +#endif +# +#if WITH_COMPLEX_VSX_BLOCK1_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c +#if 
WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c +#endif +#endif + if WITH_COMPLEX_SSE_BLOCK1_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_1hv_single_precision.c +endif endif if WITH_COMPLEX_AVX_BLOCK1_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c +endif +else +if WITH_COMPLEX_AVX2_BLOCK1_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c endif +endif +endif + + +if WITH_COMPLEX_AVX512_BLOCK1_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_1hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_1hv_single_precision.c +endif +endif + +#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c +#endif +#endif +# +#if WITH_COMPLEX_VSX_BLOCK2_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_single_precision.c +#endif +#endif if 
WITH_COMPLEX_SSE_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sse_2hv_single_precision.c +endif endif if WITH_COMPLEX_AVX_BLOCK2_KERNEL - libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c +endif +else +if WITH_COMPLEX_AVX2_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c +endif +endif +endif + +if WITH_COMPLEX_AVX512_BLOCK2_KERNEL + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_2hv_double_precision.c +if WANT_SINGLE_PRECISION_COMPLEX + libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_avx512_2hv_single_precision.c +endif endif + +if STORE_BUILD_CONFIG + libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c +endif + + +# Cuda files +.cu.lo: + NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@ + +# Assembly files +LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \ + $(CCASFLAGS) + include generated_headers.am BUILT_SOURCES = $(generated_headers) -# install any .mod files in the include/ dir +# install public headers and Fortran modules files in the include/ dir elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ -nobase_elpa_include_HEADERS = 
$(wildcard modules/*) -nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h +nobase_elpa_include_HEADERS = \ + $(wildcard modules/*) \ + src/helpers/lapack_interfaces.h \ + src/helpers/scalapack_interfaces.h \ + elpa/elpa_simd_constants.h \ + elpa/elpa.h \ + elpa/elpa_generic.h + +nobase_nodist_elpa_include_HEADERS = \ + elpa/elpa_version.h \ + elpa/elpa_constants.h \ + elpa/elpa_generated.h \ + elpa/elpa_generated_c_api.h dist_man_MANS = \ - man/solve_evp_real.3 \ - man/solve_evp_real_1stage.3 \ - man/solve_evp_complex.3 \ - man/solve_evp_complex_1stage.3 \ - man/solve_evp_real_2stage.3 \ - man/solve_evp_complex_2stage.3 \ - man/get_elpa_row_col_comms.3 \ - man/get_elpa_communicators.3 \ - man/elpa2_print_kernels.1 - -# other files to distribute -filesdir = $(docdir)/examples -dist_files_DATA = \ - test/fortran_test_programs/read_real.F90 \ - test/fortran_test_programs/test_complex2.F90 \ - test/fortran_test_programs/test_complex2_default_kernel.F90 \ - test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_complex.F90 \ - test/fortran_test_programs/test_real2.F90 \ - test/fortran_test_programs/test_real2_default_kernel.F90 \ - test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_real.F90 \ - test/fortran_test_programs/test_real_with_c.F90 \ - src/elpa2_print_kernels.F90 + man/elpa2_print_kernels.1 \ + man/elpa_init.3 \ + man/elpa_allocate.3 \ + man/elpa_set.3 \ + man/elpa_setup.3 \ + man/elpa_eigenvalues.3 \ + man/elpa_eigenvectors.3 \ + man/elpa_skew_eigenvalues.3 \ + man/elpa_skew_eigenvectors.3 \ + man/elpa_generalized_eigenvectors.3 \ + man/elpa_generalized_eigenvalues.3 \ + man/elpa_cholesky.3 \ + man/elpa_invert_triangular.3 \ + man/elpa_solve_tridiagonal.3 \ + man/elpa_hermitian_multiply.3 \ + man/elpa_deallocate.3 \ + man/elpa_load_settings.3 
\ + man/elpa_store_settings.3 \ + man/elpa_print_settings.3 \ + man/elpa_autotune_save_state.3 \ + man/elpa_autotune_load_state.3 \ + man/elpa_autotune_print_state.3 \ + man/elpa_autotune_setup.3 \ + man/elpa_autotune_step.3 \ + man/elpa_autotune_set_best.3 \ + man/elpa_autotune_deallocate.3 \ + man/elpa_uninit.3 -dist_doc_DATA = README.md USERS_GUIDE.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt +dist_doc_DATA = README.md USERS_GUIDE.md USERS_GUIDE_DEPRECATED_LEGACY_API.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt # pkg-config stuff pkgconfigdir = $(libdir)/pkgconfig @@ -174,209 +490,280 @@ bin_PROGRAMS = \ elpa2_print_kernels@SUFFIX@ -noinst_PROGRAMS = \ - elpa1_test_real@SUFFIX@ \ - elpa1_test_complex@SUFFIX@ \ - elpa2_test_real@SUFFIX@ \ - elpa2_test_complex@SUFFIX@ \ - elpa2_test_real_default_kernel@SUFFIX@ \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@ \ - elpa2_test_complex_default_kernel@SUFFIX@ \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@ \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@ \ - elpa1_test_real_with_c@SUFFIX@ -if !WITH_OPENMP -noinst_PROGRAMS += \ - elpa1_test_real_c_version@SUFFIX@ \ - elpa1_test_complex_c_version@SUFFIX@ \ - elpa2_test_real_c_version@SUFFIX@ \ - elpa2_test_complex_c_version@SUFFIX@ -endif +noinst_PROGRAMS = +check_SCRIPTS = -build_lib = libelpa@SUFFIX@.la +test_program_ldadd = libelpatest@SUFFIX@.la libelpa@SUFFIX@.la +test_program_fcflags = $(AM_FCFLAGS) $(FC_MODOUT)test_modules $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules -if HAVE_REDIRECT - redirect_sources = test/shared_sources/redir.c test/shared_sources/redirect.F90 -else - redirect_sources = +# library with shared sources for the test files +noinst_LTLIBRARIES += libelpatest@SUFFIX@.la +libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags) +libelpatest@SUFFIX@_la_SOURCES = \ + 
test/shared/tests_variable_definitions.F90 \ + test/shared/mod_tests_scalapack_interfaces.F90 \ + test/shared/mod_tests_blas_interfaces.F90 \ + test/shared/test_util.F90 \ + test/shared/test_read_input_parameters.F90 \ + test/shared/test_check_correctness.F90 \ + test/shared/test_setup_mpi.F90 \ + test/shared/test_blacs_infrastructure.F90 \ + test/shared/test_prepare_matrix.F90 \ + test/shared/test_analytic.F90 \ + test/shared/test_output_type.F90 + +if WITH_SCALAPACK_TESTS +libelpatest@SUFFIX@_la_SOURCES += \ + test/shared/test_scalapack.F90 endif -#test/shared_sources/mod_precision_created.f90: src/mod_precision.f90 -# cp $(top_srcdir)/src/mod_precision.f90 $(top_srcdir)/test/shared_sources/mod_precision_created.f90 +if HAVE_REDIRECT +libelpatest@SUFFIX@_la_SOURCES += \ + test/shared/test_redir.c \ + test/shared/test_redirect.F90 +endif -shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 -if !WITH_OPENMP -elpa1_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_real_c_version.c $(shared_sources) $(redirect_sources) -elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -elpa1_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa1_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_complex_c_version.c $(shared_sources) $(redirect_sources) -elpa1_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -elpa1_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_real_c_version@SUFFIX@_SOURCES = 
test/c_test_programs/elpa2_test_real_c_version.c $(shared_sources) $(redirect_sources) -elpa2_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -elpa2_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_complex_c_version.c $(shared_sources) $(redirect_sources) -elpa2_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -elpa2_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -endif - -elpa1_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real.F90 $(shared_sources) $(redirect_sources) -elpa1_test_real@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa1_test_real_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_real_with_c.F90 test/shared_sources/mod_from_c.F90 \ - test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) -elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -#elpa1_test_complex_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) -#elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib) -#EXTRA_elpa1_test_complex_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources) -elpa2_test_real@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = 
test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) -elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - - -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - $(shared_sources) $(redirect_sources) -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - $(shared_sources) $(redirect_sources) -elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources) -elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - - -elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources) -elpa2_test_complex@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources) -elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib) 
-EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - $(shared_sources) $(redirect_sources) -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90 $(shared_sources) $(redirect_sources) -elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib) - -check_SCRIPTS = \ - elpa1_test_real@SUFFIX@.sh \ - elpa1_test_real_with_c@SUFFIX@.sh \ - elpa2_test_real@SUFFIX@.sh \ - elpa2_test_real_default_kernel@SUFFIX@.sh \ - elpa1_test_complex@SUFFIX@.sh \ - elpa2_test_complex@SUFFIX@.sh \ - elpa2_test_complex_default_kernel@SUFFIX@.sh \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_print_kernels@SUFFIX@ -if !WITH_OPENMP -check_SCRIPTS += \ - elpa1_test_real_c_version@SUFFIX@.sh \ - elpa1_test_complex_c_version@SUFFIX@.sh \ - elpa2_test_real_c_version@SUFFIX@.sh \ - elpa2_test_complex_c_version@SUFFIX@.sh +elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2/elpa2_print_kernels.F90 +elpa2_print_kernels@SUFFIX@_LDADD = libelpa@SUFFIX@.la +elpa2_print_kernels@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)modules + +include test_programs.am + +#include test_programs_manual.am + +noinst_PROGRAMS += validate_double_instance@SUFFIX@ +check_SCRIPTS += validate_double_instance@SUFFIX@_default.sh +validate_double_instance@SUFFIX@_SOURCES = test/Fortran/elpa2/double_instance.F90 +validate_double_instance@SUFFIX@_LDADD = $(test_program_ldadd) +validate_double_instance@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules 
$(FC_MODINC)private_modules + +noinst_PROGRAMS += validate_real_2stage_banded@SUFFIX@ +check_SCRIPTS += validate_real_2stage_banded@SUFFIX@_default.sh +validate_real_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/real_2stage_banded.F90 +validate_real_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_real_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules + +noinst_PROGRAMS += validate_complex_2stage_banded@SUFFIX@ +check_SCRIPTS += validate_complex_2stage_banded@SUFFIX@_default.sh +validate_complex_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/complex_2stage_banded.F90 +validate_complex_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_complex_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules + +if WANT_SINGLE_PRECISION_REAL +noinst_PROGRAMS += validate_single_real_2stage_banded@SUFFIX@ +check_SCRIPTS += validate_single_real_2stage_banded@SUFFIX@_default.sh +validate_single_real_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/single_real_2stage_banded.F90 +validate_single_real_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_single_real_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +endif + +if WANT_SINGLE_PRECISION_COMPLEX +noinst_PROGRAMS += validate_single_complex_2stage_banded@SUFFIX@ +check_SCRIPTS += validate_single_complex_2stage_banded@SUFFIX@_default.sh +validate_single_complex_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/single_complex_2stage_banded.F90 +validate_single_complex_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_single_complex_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +endif + +# python wrapper +pyelpadir = $(pythondir)/pyelpa +if WITH_PYTHON +pyelpa_PYTHON = python/pyelpa/__init__.py 
python/pyelpa/distributedmatrix.py +pyelpa_LTLIBRARIES = wrapper.la +else +pyelpa_PYTHON = +pyelpa_LTLIBRARIES = endif +nodist_wrapper_la_SOURCES = python/pyelpa/wrapper.c +wrapper_la_LDFLAGS = -module -avoid-version -shared $(AM_LDFLAGS) +wrapper_la_LIBADD = libelpa@SUFFIX@.la +wrapper_la_CFLAGS = $(PYTHON_INCLUDE) $(NUMPY_INCLUDE) $(AM_CFLAGS) - +python/pyelpa/wrapper.c: python/pyelpa/wrapper.pyx + cython $< -o $@ # test scripts +TASKS ?= 2 if WITH_MPI - wrapper="mpiexec -n 2 " + wrapper=$(MPI_BINARY) -n $${TASKS:-$(TASKS)} else - wrapper="" + wrapper= endif TESTS = $(check_SCRIPTS) -%.sh: % - echo '$(wrapper)./$^ $$TEST_FLAGS' > $@ - chmod +x $@ - -## this one does not want any arguments -#elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh: -# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@ -# chmod +x $@ - -#elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh: -# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@ -# chmod +x $@ - -# Preprocessed files (just used for manual inspection) -elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ - -elpa2.i: $(top_srcdir)/src/elpa2.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@ - -elpa1.i: $(top_srcdir)/src/elpa1.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ - -elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ - -mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ - -mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c 
$(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@ - +%_extended.sh: % + @echo "#!/bin/bash" > $@ + @echo 'if [ "$$CHECK_LEVEL" = "extended" ] ; then $(wrapper) ./$^ $$TEST_FLAGS ; else exit 77; fi' >> $@ + @chmod +x $@ + +%_default.sh: % + @echo "#!/bin/bash" > $@ + @echo '$(wrapper)' ./$^ '$$TEST_FLAGS' >> $@ + @chmod +x $@ + +if WITH_PYTHON_TESTS +check_SCRIPTS += test_python.sh +endif + +test_python.sh: + @echo '#!/bin/bash' > $@ +# this is kind of hacky... is there a better way to get wrapper.so? + @echo 'export PYTHONPATH=./python-copy:$$PYTHONPATH' >> $@ + @echo 'cp -r $(abs_top_srcdir)/python python-copy || exit 1' >> $@ + @echo 'chmod u+rwX -R python-copy || exit 1' >> $@ + @echo 'cp .libs/wrapper.so python-copy/pyelpa/ || exit 1' >> $@ +# the dlopen flags are needed for MKL to work properly... +# only in os from python 3.3 on + @echo "$(wrapper) $(PYTHON) -c 'import sys, os; sys.setdlopenflags(os.RTLD_NOW | os.RTLD_GLOBAL); import pytest; sys.exit(pytest.main([\"./python-copy\", \"-p\", \"no:cacheprovider\"]))'" >> $@ + @echo 'exit_code=$$?' 
>> $@ + @echo 'rm -rf python-copy || exit 1' >> $@ + @echo 'exit $$exit_code' >> $@ + @chmod +x $@ include doxygen.am CLEANFILES = \ - elpa-generated.h \ + elpa_generated.h \ + elpa_generated_c_api.h \ elpa1_test* \ elpa2_test*\ - *.i + elpa2_real* \ + elpa1_real* \ + elpa*.sh \ + test*.sh \ + single_real* \ + single_complex* \ + real* \ + complex* \ + double_instance* \ + *.i \ + python/pyelpa/wrapper.c \ + check_python.sh clean-local: - -rm -rf modules/* .fortran_dependencies/* + -rm -rf modules/* private_modules/* test_modules/* .fortran_dependencies/* + -rm -rf validate_*.sh + -rm -rf real_2stage*.sh + -rm -rf complex_2stage*.sh + -rm -rf single_complex_2stage*.sh + -rm -rf single_real_2stage*.sh + -rm -rf double_instance_onenode*.sh -rm -rf $(generated_headers) distclean-local: + -rm -rf ./m4 + -rm -rf ./src + -rm -rf ./test + -rm -rf ./modules + -rm -rf .fortran_dependencies -rm config-f90.h - -rm -rf ./src/elpa2_kernels/.deps - -rm -rf ./src/.deps -rm -rf ./test/.deps - -rmdir ./src/elpa2_kernels/ - -rmdir ./src - -rmdir ./test - -rmdir ./m4 - -rmdir modules/ - -rmdir .fortran_dependencies/ + -rm -rf elpa/elpa_generated_c_api.h EXTRA_DIST = \ - fdep/fortran_dependencies.pl \ + elpa.spec \ + elpa/elpa.h \ + elpa/elpa_generic.h \ fdep/fortran_dependencies.mk \ - test/fortran_test_programs/elpa_test_programs_print_headers.X90 \ - src/elpa_reduce_add_vectors.X90 \ - src/elpa_transpose_vectors.X90 \ - src/redist_band.X90 \ - elpa.spec + fdep/fortran_dependencies.pl \ + manual_cpp \ + nvcc_wrap \ + remove_xcompiler \ + src/helpers/fortran_blas_interfaces.F90 \ + src/helpers/fortran_scalapack_interfaces.F90 \ + src/GPU/cuUtils_template.cu \ + src/elpa_api_math_template.F90 \ + src/elpa_impl_math_template.F90 \ + src/elpa_impl_generalized_transform_template.F90 \ + src/elpa1/elpa1_compute_template.F90 \ + src/elpa1/elpa1_merge_systems_real_template.F90 \ + src/elpa1/elpa1_solve_tridi_real_template.F90 \ + src/elpa1/elpa1_template.F90 \ + 
src/elpa1/elpa1_tools_template.F90 \ + src/elpa1/elpa1_trans_ev_template.F90 \ + src/elpa1/elpa1_tridiag_template.F90 \ + src/elpa1/elpa_cholesky_template.F90 \ + src/elpa1/elpa_invert_trm.F90 \ + src/elpa1/elpa_multiply_a_b.F90 \ + src/elpa1/elpa_reduce_add_vectors.F90 \ + src/elpa1/elpa_solve_tridi_impl_public.F90 \ + src/elpa1/elpa_transpose_vectors.F90 \ + src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \ + src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \ + src/elpa2/compute_hh_trafo.F90 \ + src/elpa2/elpa2_bandred_template.F90 \ + src/elpa2/elpa2_compute_complex_template.F90 \ + src/elpa2/elpa2_compute_real_template.F90 \ + src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 \ + src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \ + src/elpa2/elpa2_template.F90 \ + src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ + src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ + src/elpa2/elpa2_tridiag_band_template.F90 \ + src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \ + src/elpa2/kernels/complex_template.F90 \ + src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \ + src/elpa2/kernels/real_template.F90 \ + src/elpa2/kernels/simple_template.F90 \ + src/elpa2/kernels/simple_block4_template.F90 \ + src/elpa2/kernels/simple_block6_template.F90 \ + src/elpa2/pack_unpack_cpu.F90 \ + src/elpa2/pack_unpack_gpu.F90 \ + src/elpa2/qr/elpa_pdgeqrf_template.F90 \ + src/elpa2/qr/elpa_pdlarfb_template.F90 \ + src/elpa2/qr/elpa_qrkernels_template.F90 \ + src/elpa2/qr/qr_utils_template.F90 \ + src/elpa2/redist_band.F90 \ + src/elpa_generalized/cannon_forw_template.c \ + src/elpa_generalized/cannon_back_template.c \ + src/elpa_index.h \ + src/fortran_constants.h \ + src/general/map_global_to_local.F90 \ + src/general/precision_macros.h \ + src/general/precision_typedefs.h \ + src/general/precision_kinds.F90 \ + test/shared/test_precision_kinds.F90 \ + src/general/prow_pcol.F90 \ + src/general/sanity.F90 \ + 
src/general/elpa_ssr2_template.F90 \ + src/general/elpa_ssmv_template.F90 \ + test/Fortran/assert.h \ + test/Fortran/elpa_print_headers.F90 \ + test/shared/test_check_correctness_template.F90 \ + test/shared/test_prepare_matrix_template.F90 \ + test/shared/test_analytic_template.F90 \ + test_project_1stage/Makefile.am \ + test_project_1stage/autogen.sh \ + test_project_1stage/configure.ac \ + test_project_1stage/fdep \ + test_project_1stage/m4 \ + test_project_1stage/src/test_real.F90 \ + test_project_2stage/Makefile.am \ + test_project_2stage/autogen.sh \ + test_project_2stage/configure.ac \ + test_project_2stage/fdep \ + test_project_2stage/m4 \ + test_project_2stage/src/test_real2.F90 \ + test_project_C/Makefile.am \ + test_project_C/autogen.sh \ + test_project_C/configure.ac \ + test_project_C/fdep \ + test_project_C/m4 \ + test_project_C/src/test_real.c \ + test_project_C/src/test_blacs_infrastructure.F90 + +if WITH_SCALAPACK_TESTS +EXTRA_DIST += \ + test/shared/test_scalapack_template.F90 +endif + +# python wrapper files +EXTRA_DIST += python/pyelpa/__init__.py \ + python/pyelpa/distributedmatrix.py \ + python/pyelpa/wrapper.pyx \ + python/tests/test_elpa_import.py \ + python/tests/test_mpi4py.py \ + python/tests/test_numroc.py \ + python/tests/test_with_mpi.py LIBTOOL_DEPS = @LIBTOOL_DEPS@ libtool: $(LIBTOOL_DEPS) diff -Nru elpa-2016.05.001/Makefile.in elpa-2019.11.001/Makefile.in --- elpa-2016.05.001/Makefile.in 2016-05-20 07:04:38.000000000 +0000 +++ elpa-2019.11.001/Makefile.in 2019-12-21 16:29:48.000000000 +0000 @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.15 from Makefile.am. +# Makefile.in generated by automake 1.16.1 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2014 Free Software Foundation, Inc. +# Copyright (C) 1994-2018 Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -92,84 +92,1944 @@ build_triplet = @build@ host_triplet = @host@ @HAVE_DETAILED_TIMINGS_TRUE@am__append_1 = \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/timer.F90 \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings.F90 \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_type.F90 \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_value.F90 \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/highwater_mark.c \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/resident_set_size.c \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/time.c \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/virtual_memory.c \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/papi.c - -@WITH_MPI_FALSE@am__append_2 = src/mod_time_c.F90 -@HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__append_3 = src/ftimings/time.c -@WITH_REAL_GENERIC_KERNEL_TRUE@am__append_4 = src/elpa2_kernels/elpa2_kernels_real.F90 -@WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__append_5 = src/elpa2_kernels/elpa2_kernels_complex.F90 -@WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__append_6 = src/elpa2_kernels/elpa2_kernels_real_simple.F90 -@WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__append_7 = src/elpa2_kernels/elpa2_kernels_complex_simple.F90 -@WITH_REAL_BGP_KERNEL_TRUE@am__append_8 = src/elpa2_kernels/elpa2_kernels_real_bgp.f90 -@WITH_REAL_BGQ_KERNEL_TRUE@am__append_9 = src/elpa2_kernels/elpa2_kernels_real_bgq.f90 -@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__append_10 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__append_11 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__append_12 = src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c -@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__append_13 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c -@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__append_14 = src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c 
-@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__append_15 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c -@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__append_16 = src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c -@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__append_17 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c -@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__append_18 = src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c -@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c -@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__append_20 = src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c -@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c -bin_PROGRAMS = elpa2_print_kernels@SUFFIX@$(EXEEXT) -noinst_PROGRAMS = elpa1_test_real@SUFFIX@$(EXEEXT) \ - elpa1_test_complex@SUFFIX@$(EXEEXT) \ - elpa2_test_real@SUFFIX@$(EXEEXT) \ - elpa2_test_complex@SUFFIX@$(EXEEXT) \ - elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT) \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT) \ - elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT) \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT) \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT) \ - elpa1_test_real_with_c@SUFFIX@$(EXEEXT) $(am__EXEEXT_1) -@WITH_OPENMP_FALSE@am__append_22 = \ -@WITH_OPENMP_FALSE@ elpa1_test_real_c_version@SUFFIX@ \ -@WITH_OPENMP_FALSE@ elpa1_test_complex_c_version@SUFFIX@ \ -@WITH_OPENMP_FALSE@ elpa2_test_real_c_version@SUFFIX@ \ -@WITH_OPENMP_FALSE@ elpa2_test_complex_c_version@SUFFIX@ - -@WITH_OPENMP_FALSE@am__append_23 = \ -@WITH_OPENMP_FALSE@ elpa1_test_real_c_version@SUFFIX@.sh \ -@WITH_OPENMP_FALSE@ elpa1_test_complex_c_version@SUFFIX@.sh \ -@WITH_OPENMP_FALSE@ elpa2_test_real_c_version@SUFFIX@.sh \ -@WITH_OPENMP_FALSE@ elpa2_test_complex_c_version@SUFFIX@.sh +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings.F90 \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_type.F90 \ 
+@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_value.F90 \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/highwater_mark.c \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/resident_set_size.c \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/time.c \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/virtual_memory.c \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/papi.c + +@HAVE_DETAILED_TIMINGS_FALSE@am__append_2 = \ +@HAVE_DETAILED_TIMINGS_FALSE@ src/helpers/timer_dummy.F90 + +@WITH_GPU_VERSION_TRUE@am__append_3 = src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu +@WITH_GPU_VERSION_TRUE@am__append_4 = src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu +@WITH_MPI_FALSE@am__append_5 = src/helpers/mod_time_c.F90 +@HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__append_6 = src/ftimings/time.c +@HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE@am__append_7 = src/helpers/get_cpuid_set.c src/helpers/mod_simd_kernel.F90 +@WITH_REAL_GENERIC_KERNEL_TRUE@am__append_8 = src/elpa2/kernels/real.F90 +@WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__append_9 = src/elpa2/kernels/complex.F90 +@WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__append_10 = src/elpa2/kernels/real_simple.F90 +@WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__append_11 = src/elpa2/kernels/complex_simple.F90 +@WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE@am__append_12 = src/elpa2/kernels/real_simple_block4.F90 +@WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE@am__append_13 = src/elpa2/kernels/real_simple_block6.F90 +@WITH_REAL_BGP_KERNEL_TRUE@am__append_14 = src/elpa2/kernels/real_bgp.f90 +@WITH_REAL_BGQ_KERNEL_TRUE@am__append_15 = src/elpa2/kernels/real_bgq.f90 +@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__append_16 = src/elpa2/kernels/asm_x86_64_double_precision.s +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__append_17 = src/elpa2/kernels/asm_x86_64_single_precision.s 
+@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__append_18 = src/elpa2/kernels/asm_x86_64_double_precision.s +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__append_19 = src/elpa2/kernels/asm_x86_64_single_precision.s +@WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE@am__append_20 = src/elpa2/kernels/real_sparc64_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_2hv_single_precision.c +#endif +@WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE@am__append_21 = src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE@am__append_22 = src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c +@WITH_REAL_VSX_BLOCK2_KERNEL_TRUE@am__append_23 = src/elpa2/kernels/real_vsx_2hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK2_KERNEL_TRUE@am__append_24 = src/elpa2/kernels/real_vsx_2hv_single_precision.c +@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__append_25 = src/elpa2/kernels/real_sse_2hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__append_26 = src/elpa2/kernels/real_sse_2hv_single_precision.c +@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__append_27 = src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__append_28 = src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c +@WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_FALSE@am__append_29 = src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_FALSE@am__append_30 = src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c +@WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE@am__append_31 = src/elpa2/kernels/real_avx512_2hv_double_precision.c 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE@am__append_32 = src/elpa2/kernels/real_avx512_2hv_single_precision.c +@WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE@am__append_33 = src/elpa2/kernels/real_sparc64_4hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c +#endif +@WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE@am__append_34 = src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE@am__append_35 = src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c +@WITH_REAL_VSX_BLOCK4_KERNEL_TRUE@am__append_36 = src/elpa2/kernels/real_vsx_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK4_KERNEL_TRUE@am__append_37 = src/elpa2/kernels/real_vsx_4hv_single_precision.c +@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__append_38 = src/elpa2/kernels/real_sse_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__append_39 = src/elpa2/kernels/real_sse_4hv_single_precision.c +@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__append_40 = src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__append_41 = src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c +@WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_FALSE@am__append_42 = src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_FALSE@am__append_43 = src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c +@WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE@am__append_44 = src/elpa2/kernels/real_avx512_4hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE@am__append_45 = src/elpa2/kernels/real_avx512_4hv_single_precision.c +@WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE@am__append_46 = 
src/elpa2/kernels/real_sparc64_6hv_double_precision.c +#if WANT_SINGLE_PRECISION_REAL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c +#endif +@WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE@am__append_47 = src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE@am__append_48 = src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c +@WITH_REAL_VSX_BLOCK6_KERNEL_TRUE@am__append_49 = src/elpa2/kernels/real_vsx_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK6_KERNEL_TRUE@am__append_50 = src/elpa2/kernels/real_vsx_6hv_single_precision.c +@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__append_51 = src/elpa2/kernels/real_sse_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__append_52 = src/elpa2/kernels/real_sse_6hv_single_precision.c +@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__append_53 = src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__append_54 = src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c +@WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_FALSE@am__append_55 = src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_FALSE@am__append_56 = src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c +@WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE@am__append_57 = src/elpa2/kernels/real_avx512_6hv_double_precision.c +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE@am__append_58 = src/elpa2/kernels/real_avx512_6hv_single_precision.c + +#if WITH_COMPLEX_SPARC64_BLOCK1_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_1hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += 
src/elpa2/kernels/complex_sparc64_1hv_single_precision.c +#endif +#endif +# +#if WITH_COMPLEX_VSX_BLOCK1_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_1hv_single_precision.c +#endif +#endif +@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__append_59 = src/elpa2/kernels/complex_sse_1hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__append_60 = src/elpa2/kernels/complex_sse_1hv_single_precision.c +@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__append_61 = src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__append_62 = src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c +@WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE@am__append_63 = src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE@am__append_64 = src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c +@WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE@am__append_65 = src/elpa2/kernels/complex_avx512_1hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE@am__append_66 = src/elpa2/kernels/complex_avx512_1hv_single_precision.c + +#if WITH_COMPLEX_SPARC64_BLOCK2_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_sparc64_2hv_single_precision.c +#endif +#endif +# +#if WITH_COMPLEX_VSX_BLOCK2_KERNEL +# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/complex_vsx_2hv_double_precision.c +#if WANT_SINGLE_PRECISION_COMPLEX +# libelpa@SUFFIX@_private_la_SOURCES += 
src/elpa2/kernels/complex_vsx_2hv_single_precision.c +#endif +#endif +@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__append_67 = src/elpa2/kernels/complex_sse_2hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__append_68 = src/elpa2/kernels/complex_sse_2hv_single_precision.c +@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__append_69 = src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__append_70 = src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c +@WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE@am__append_71 = src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE@am__append_72 = src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c +@WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE@am__append_73 = src/elpa2/kernels/complex_avx512_2hv_double_precision.c +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE@am__append_74 = src/elpa2/kernels/complex_avx512_2hv_single_precision.c +@STORE_BUILD_CONFIG_TRUE@am__append_75 = src/helpers/print_build_config.c +bin_PROGRAMS = elpa2_print_kernels@SUFFIX@$(EXEEXT) $(am__EXEEXT_1) \ + $(am__EXEEXT_2) $(am__EXEEXT_3) $(am__EXEEXT_4) \ + $(am__EXEEXT_5) $(am__EXEEXT_6) $(am__EXEEXT_7) \ + $(am__EXEEXT_8) $(am__EXEEXT_9) $(am__EXEEXT_10) \ + $(am__EXEEXT_11) $(am__EXEEXT_12) $(am__EXEEXT_13) \ + $(am__EXEEXT_14) $(am__EXEEXT_15) $(am__EXEEXT_16) \ + $(am__EXEEXT_17) $(am__EXEEXT_18) $(am__EXEEXT_19) \ + $(am__EXEEXT_20) $(am__EXEEXT_21) $(am__EXEEXT_22) \ + $(am__EXEEXT_23) $(am__EXEEXT_24) $(am__EXEEXT_25) \ + $(am__EXEEXT_26) $(am__EXEEXT_27) $(am__EXEEXT_28) \ + $(am__EXEEXT_29) $(am__EXEEXT_30) $(am__EXEEXT_31) \ + $(am__EXEEXT_32) $(am__EXEEXT_33) $(am__EXEEXT_34) \ + $(am__EXEEXT_35) $(am__EXEEXT_36) $(am__EXEEXT_37) \ + $(am__EXEEXT_38) 
$(am__EXEEXT_39) $(am__EXEEXT_40) \ + $(am__EXEEXT_41) $(am__EXEEXT_42) $(am__EXEEXT_43) \ + $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \ + $(am__EXEEXT_47) $(am__EXEEXT_48) $(am__EXEEXT_49) \ + $(am__EXEEXT_50) $(am__EXEEXT_51) $(am__EXEEXT_52) \ + $(am__EXEEXT_53) $(am__EXEEXT_54) $(am__EXEEXT_55) \ + $(am__EXEEXT_56) $(am__EXEEXT_57) $(am__EXEEXT_58) \ + $(am__EXEEXT_59) $(am__EXEEXT_60) $(am__EXEEXT_61) \ + $(am__EXEEXT_62) $(am__EXEEXT_63) $(am__EXEEXT_64) \ + $(am__EXEEXT_65) $(am__EXEEXT_66) $(am__EXEEXT_67) \ + $(am__EXEEXT_68) $(am__EXEEXT_69) $(am__EXEEXT_70) \ + $(am__EXEEXT_71) $(am__EXEEXT_72) $(am__EXEEXT_73) \ + $(am__EXEEXT_74) $(am__EXEEXT_75) $(am__EXEEXT_76) \ + $(am__EXEEXT_77) $(am__EXEEXT_78) $(am__EXEEXT_79) \ + $(am__EXEEXT_80) $(am__EXEEXT_81) $(am__EXEEXT_82) \ + $(am__EXEEXT_83) $(am__EXEEXT_84) $(am__EXEEXT_85) \ + $(am__EXEEXT_86) $(am__EXEEXT_87) $(am__EXEEXT_88) \ + $(am__EXEEXT_89) $(am__EXEEXT_90) $(am__EXEEXT_91) \ + $(am__EXEEXT_92) $(am__EXEEXT_93) $(am__EXEEXT_94) \ + $(am__EXEEXT_95) $(am__EXEEXT_96) $(am__EXEEXT_97) \ + $(am__EXEEXT_98) $(am__EXEEXT_99) $(am__EXEEXT_100) \ + $(am__EXEEXT_101) $(am__EXEEXT_102) $(am__EXEEXT_103) \ + $(am__EXEEXT_104) $(am__EXEEXT_105) $(am__EXEEXT_106) \ + $(am__EXEEXT_107) $(am__EXEEXT_108) $(am__EXEEXT_109) \ + $(am__EXEEXT_110) $(am__EXEEXT_111) $(am__EXEEXT_112) \ + $(am__EXEEXT_113) $(am__EXEEXT_114) $(am__EXEEXT_115) \ + $(am__EXEEXT_116) $(am__EXEEXT_117) $(am__EXEEXT_118) \ + $(am__EXEEXT_119) $(am__EXEEXT_120) $(am__EXEEXT_121) \ + $(am__EXEEXT_122) $(am__EXEEXT_123) $(am__EXEEXT_124) \ + $(am__EXEEXT_125) $(am__EXEEXT_126) $(am__EXEEXT_127) \ + $(am__EXEEXT_128) $(am__EXEEXT_129) $(am__EXEEXT_130) \ + $(am__EXEEXT_131) $(am__EXEEXT_132) $(am__EXEEXT_133) \ + $(am__EXEEXT_134) $(am__EXEEXT_135) $(am__EXEEXT_136) \ + $(am__EXEEXT_137) $(am__EXEEXT_138) $(am__EXEEXT_139) \ + $(am__EXEEXT_140) $(am__EXEEXT_141) $(am__EXEEXT_142) \ + $(am__EXEEXT_143) $(am__EXEEXT_144) 
$(am__EXEEXT_145) \ + $(am__EXEEXT_146) $(am__EXEEXT_147) $(am__EXEEXT_148) \ + $(am__EXEEXT_149) $(am__EXEEXT_150) $(am__EXEEXT_151) \ + $(am__EXEEXT_152) $(am__EXEEXT_153) $(am__EXEEXT_154) \ + $(am__EXEEXT_155) $(am__EXEEXT_156) $(am__EXEEXT_157) \ + $(am__EXEEXT_158) $(am__EXEEXT_159) $(am__EXEEXT_160) \ + $(am__EXEEXT_161) $(am__EXEEXT_162) $(am__EXEEXT_163) \ + $(am__EXEEXT_164) $(am__EXEEXT_165) $(am__EXEEXT_166) \ + $(am__EXEEXT_167) $(am__EXEEXT_168) $(am__EXEEXT_169) \ + $(am__EXEEXT_170) $(am__EXEEXT_171) $(am__EXEEXT_172) \ + $(am__EXEEXT_173) $(am__EXEEXT_174) $(am__EXEEXT_175) \ + $(am__EXEEXT_176) $(am__EXEEXT_177) $(am__EXEEXT_178) \ + $(am__EXEEXT_179) $(am__EXEEXT_180) $(am__EXEEXT_181) \ + $(am__EXEEXT_182) $(am__EXEEXT_183) $(am__EXEEXT_184) \ + $(am__EXEEXT_185) $(am__EXEEXT_186) $(am__EXEEXT_187) \ + $(am__EXEEXT_188) $(am__EXEEXT_189) $(am__EXEEXT_190) \ + $(am__EXEEXT_191) $(am__EXEEXT_192) $(am__EXEEXT_193) \ + $(am__EXEEXT_194) $(am__EXEEXT_195) $(am__EXEEXT_196) \ + $(am__EXEEXT_197) $(am__EXEEXT_198) $(am__EXEEXT_199) \ + $(am__EXEEXT_200) $(am__EXEEXT_201) $(am__EXEEXT_202) \ + $(am__EXEEXT_203) $(am__EXEEXT_204) $(am__EXEEXT_205) \ + $(am__EXEEXT_206) $(am__EXEEXT_207) $(am__EXEEXT_208) \ + $(am__EXEEXT_209) $(am__EXEEXT_210) $(am__EXEEXT_211) \ + $(am__EXEEXT_212) $(am__EXEEXT_213) $(am__EXEEXT_214) \ + $(am__EXEEXT_215) $(am__EXEEXT_216) $(am__EXEEXT_217) \ + $(am__EXEEXT_218) $(am__EXEEXT_219) $(am__EXEEXT_220) \ + $(am__EXEEXT_221) $(am__EXEEXT_222) $(am__EXEEXT_223) \ + $(am__EXEEXT_224) $(am__EXEEXT_225) $(am__EXEEXT_226) \ + $(am__EXEEXT_227) $(am__EXEEXT_228) $(am__EXEEXT_229) \ + $(am__EXEEXT_230) $(am__EXEEXT_231) $(am__EXEEXT_232) \ + $(am__EXEEXT_233) $(am__EXEEXT_234) $(am__EXEEXT_235) \ + $(am__EXEEXT_236) $(am__EXEEXT_237) $(am__EXEEXT_238) \ + $(am__EXEEXT_239) $(am__EXEEXT_240) $(am__EXEEXT_241) \ + $(am__EXEEXT_242) $(am__EXEEXT_243) $(am__EXEEXT_244) \ + $(am__EXEEXT_245) $(am__EXEEXT_246) $(am__EXEEXT_247) \ + 
$(am__EXEEXT_248) $(am__EXEEXT_249) $(am__EXEEXT_250) \ + $(am__EXEEXT_251) $(am__EXEEXT_252) $(am__EXEEXT_253) \ + $(am__EXEEXT_254) +noinst_PROGRAMS = $(am__EXEEXT_255) $(am__EXEEXT_256) \ + $(am__EXEEXT_257) $(am__EXEEXT_258) $(am__EXEEXT_259) \ + $(am__EXEEXT_260) $(am__EXEEXT_261) $(am__EXEEXT_262) \ + $(am__EXEEXT_263) $(am__EXEEXT_264) $(am__EXEEXT_265) \ + $(am__EXEEXT_266) $(am__EXEEXT_267) $(am__EXEEXT_268) \ + $(am__EXEEXT_269) $(am__EXEEXT_270) $(am__EXEEXT_271) \ + $(am__EXEEXT_272) $(am__EXEEXT_273) $(am__EXEEXT_274) \ + $(am__EXEEXT_275) $(am__EXEEXT_276) $(am__EXEEXT_277) \ + $(am__EXEEXT_278) $(am__EXEEXT_279) $(am__EXEEXT_280) \ + $(am__EXEEXT_281) $(am__EXEEXT_282) $(am__EXEEXT_283) \ + $(am__EXEEXT_284) $(am__EXEEXT_285) $(am__EXEEXT_286) \ + $(am__EXEEXT_287) $(am__EXEEXT_288) $(am__EXEEXT_289) \ + $(am__EXEEXT_290) $(am__EXEEXT_291) $(am__EXEEXT_292) \ + $(am__EXEEXT_293) $(am__EXEEXT_294) $(am__EXEEXT_295) \ + $(am__EXEEXT_296) $(am__EXEEXT_297) $(am__EXEEXT_298) \ + $(am__EXEEXT_299) $(am__EXEEXT_300) $(am__EXEEXT_301) \ + $(am__EXEEXT_302) $(am__EXEEXT_303) $(am__EXEEXT_304) \ + $(am__EXEEXT_305) $(am__EXEEXT_306) $(am__EXEEXT_307) \ + $(am__EXEEXT_308) $(am__EXEEXT_309) $(am__EXEEXT_310) \ + $(am__EXEEXT_311) $(am__EXEEXT_312) $(am__EXEEXT_313) \ + $(am__EXEEXT_314) $(am__EXEEXT_315) $(am__EXEEXT_316) \ + $(am__EXEEXT_317) $(am__EXEEXT_318) $(am__EXEEXT_319) \ + $(am__EXEEXT_320) $(am__EXEEXT_321) $(am__EXEEXT_322) \ + $(am__EXEEXT_323) $(am__EXEEXT_324) $(am__EXEEXT_325) \ + $(am__EXEEXT_326) $(am__EXEEXT_327) $(am__EXEEXT_328) \ + $(am__EXEEXT_329) $(am__EXEEXT_330) $(am__EXEEXT_331) \ + $(am__EXEEXT_332) $(am__EXEEXT_333) $(am__EXEEXT_334) \ + $(am__EXEEXT_335) $(am__EXEEXT_336) $(am__EXEEXT_337) \ + $(am__EXEEXT_338) $(am__EXEEXT_339) $(am__EXEEXT_340) \ + $(am__EXEEXT_341) $(am__EXEEXT_342) $(am__EXEEXT_343) \ + $(am__EXEEXT_344) $(am__EXEEXT_345) $(am__EXEEXT_346) \ + $(am__EXEEXT_347) $(am__EXEEXT_348) $(am__EXEEXT_349) \ + 
$(am__EXEEXT_350) $(am__EXEEXT_351) $(am__EXEEXT_352) \ + $(am__EXEEXT_353) $(am__EXEEXT_354) $(am__EXEEXT_355) \ + $(am__EXEEXT_356) $(am__EXEEXT_357) $(am__EXEEXT_358) \ + $(am__EXEEXT_359) $(am__EXEEXT_360) $(am__EXEEXT_361) \ + $(am__EXEEXT_362) $(am__EXEEXT_363) $(am__EXEEXT_364) \ + $(am__EXEEXT_365) $(am__EXEEXT_366) $(am__EXEEXT_367) \ + $(am__EXEEXT_368) $(am__EXEEXT_369) $(am__EXEEXT_370) \ + $(am__EXEEXT_371) $(am__EXEEXT_372) $(am__EXEEXT_373) \ + $(am__EXEEXT_374) $(am__EXEEXT_375) $(am__EXEEXT_376) \ + $(am__EXEEXT_377) $(am__EXEEXT_378) $(am__EXEEXT_379) \ + $(am__EXEEXT_380) $(am__EXEEXT_381) $(am__EXEEXT_382) \ + $(am__EXEEXT_383) $(am__EXEEXT_384) $(am__EXEEXT_385) \ + $(am__EXEEXT_386) $(am__EXEEXT_387) $(am__EXEEXT_388) \ + $(am__EXEEXT_389) $(am__EXEEXT_390) $(am__EXEEXT_391) \ + $(am__EXEEXT_392) $(am__EXEEXT_393) $(am__EXEEXT_394) \ + $(am__EXEEXT_395) $(am__EXEEXT_396) $(am__EXEEXT_397) \ + $(am__EXEEXT_398) $(am__EXEEXT_399) $(am__EXEEXT_400) \ + $(am__EXEEXT_401) $(am__EXEEXT_402) $(am__EXEEXT_403) \ + $(am__EXEEXT_404) $(am__EXEEXT_405) $(am__EXEEXT_406) \ + $(am__EXEEXT_407) $(am__EXEEXT_408) $(am__EXEEXT_409) \ + $(am__EXEEXT_410) $(am__EXEEXT_411) $(am__EXEEXT_412) \ + $(am__EXEEXT_413) $(am__EXEEXT_414) $(am__EXEEXT_415) \ + $(am__EXEEXT_416) $(am__EXEEXT_417) $(am__EXEEXT_418) \ + $(am__EXEEXT_419) $(am__EXEEXT_420) $(am__EXEEXT_421) \ + $(am__EXEEXT_422) $(am__EXEEXT_423) $(am__EXEEXT_424) \ + $(am__EXEEXT_425) $(am__EXEEXT_426) $(am__EXEEXT_427) \ + $(am__EXEEXT_428) $(am__EXEEXT_429) $(am__EXEEXT_430) \ + $(am__EXEEXT_431) $(am__EXEEXT_432) $(am__EXEEXT_433) \ + $(am__EXEEXT_434) $(am__EXEEXT_435) $(am__EXEEXT_436) \ + $(am__EXEEXT_437) $(am__EXEEXT_438) $(am__EXEEXT_439) \ + $(am__EXEEXT_440) $(am__EXEEXT_441) $(am__EXEEXT_442) \ + $(am__EXEEXT_443) $(am__EXEEXT_444) $(am__EXEEXT_445) \ + $(am__EXEEXT_446) $(am__EXEEXT_447) $(am__EXEEXT_448) \ + $(am__EXEEXT_449) $(am__EXEEXT_450) $(am__EXEEXT_451) \ + $(am__EXEEXT_452) 
$(am__EXEEXT_453) $(am__EXEEXT_454) \ + $(am__EXEEXT_455) $(am__EXEEXT_456) $(am__EXEEXT_457) \ + $(am__EXEEXT_458) $(am__EXEEXT_459) $(am__EXEEXT_460) \ + $(am__EXEEXT_461) $(am__EXEEXT_462) $(am__EXEEXT_463) \ + $(am__EXEEXT_464) $(am__EXEEXT_465) $(am__EXEEXT_466) \ + $(am__EXEEXT_467) $(am__EXEEXT_468) $(am__EXEEXT_469) \ + $(am__EXEEXT_470) $(am__EXEEXT_471) $(am__EXEEXT_472) \ + $(am__EXEEXT_473) $(am__EXEEXT_474) $(am__EXEEXT_475) \ + $(am__EXEEXT_476) $(am__EXEEXT_477) $(am__EXEEXT_478) \ + $(am__EXEEXT_479) $(am__EXEEXT_480) $(am__EXEEXT_481) \ + $(am__EXEEXT_482) $(am__EXEEXT_483) $(am__EXEEXT_484) \ + $(am__EXEEXT_485) $(am__EXEEXT_486) $(am__EXEEXT_487) \ + $(am__EXEEXT_488) $(am__EXEEXT_489) $(am__EXEEXT_490) \ + $(am__EXEEXT_491) $(am__EXEEXT_492) $(am__EXEEXT_493) \ + $(am__EXEEXT_494) $(am__EXEEXT_495) $(am__EXEEXT_496) \ + $(am__EXEEXT_497) $(am__EXEEXT_498) $(am__EXEEXT_499) \ + $(am__EXEEXT_500) $(am__EXEEXT_501) $(am__EXEEXT_502) \ + $(am__EXEEXT_503) $(am__EXEEXT_504) $(am__EXEEXT_505) \ + $(am__EXEEXT_506) $(am__EXEEXT_507) $(am__EXEEXT_508) \ + $(am__EXEEXT_509) $(am__EXEEXT_510) $(am__EXEEXT_511) \ + $(am__EXEEXT_512) $(am__EXEEXT_513) $(am__EXEEXT_514) \ + $(am__EXEEXT_515) test_skewsymmetric_real_double$(EXEEXT) \ + $(am__EXEEXT_516) $(am__EXEEXT_517) \ + validate_split_comm_real_double$(EXEEXT) \ + validate_double_instance@SUFFIX@$(EXEEXT) \ + validate_real_2stage_banded@SUFFIX@$(EXEEXT) \ + validate_complex_2stage_banded@SUFFIX@$(EXEEXT) \ + $(am__EXEEXT_518) $(am__EXEEXT_519) +@WITH_SCALAPACK_TESTS_TRUE@am__append_76 = \ +@WITH_SCALAPACK_TESTS_TRUE@ test/shared/test_scalapack.F90 + +@HAVE_REDIRECT_TRUE@am__append_77 = \ +@HAVE_REDIRECT_TRUE@ test/shared/test_redir.c \ +@HAVE_REDIRECT_TRUE@ test/shared/test_redirect.F90 + +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__append_78 = validate_c_version_complex_double_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ 
validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__append_79 = validate_c_version_complex_double_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_random +@ENABLE_C_TESTS_TRUE@am__append_80 = validate_c_version_complex_double_eigenvectors_1stage_random_default.sh \ +@ENABLE_C_TESTS_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh \ +@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_1stage_random_default.sh \ +@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_81 = validate_c_version_complex_single_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_82 = validate_c_version_complex_single_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_83 = validate_c_version_complex_single_eigenvectors_1stage_random_default.sh \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_84 = validate_c_version_real_single_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_85 = validate_c_version_real_single_eigenvectors_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_86 = validate_c_version_real_single_eigenvectors_1stage_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__append_87 = validate_c_version_complex_double_generalized_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__append_88 = validate_c_version_complex_double_generalized_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_1stage_random +@ENABLE_C_TESTS_TRUE@am__append_89 = validate_c_version_complex_double_generalized_1stage_random_default.sh \ +@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_90 = validate_c_version_complex_single_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_91 = 
validate_c_version_complex_single_generalized_1stage_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_92 = validate_c_version_complex_single_generalized_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_93 = validate_c_version_real_single_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_94 = validate_c_version_real_single_generalized_1stage_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_95 = validate_c_version_real_single_generalized_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__append_96 = validate_c_version_complex_double_generalized_decomp_1stage_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__append_97 = validate_c_version_complex_double_generalized_decomp_1stage_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_random +@ENABLE_C_TESTS_TRUE@am__append_98 = validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh \ +@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_99 = validate_c_version_complex_single_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_100 = validate_c_version_complex_single_generalized_decomp_1stage_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_101 = validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_102 = validate_c_version_real_single_generalized_decomp_1stage_random 
+@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_103 = validate_c_version_real_single_generalized_decomp_1stage_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_104 = validate_c_version_real_single_generalized_decomp_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_105 = validate_c_version_complex_double_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_106 = validate_c_version_complex_double_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_107 = validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh 
+@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_108 = validate_c_version_complex_single_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_109 = validate_c_version_complex_single_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_110 = validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_111 = validate_c_version_real_single_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_112 = validate_c_version_real_single_eigenvectors_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_113 = validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_114 = validate_c_version_complex_double_generalized_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_115 = validate_c_version_complex_double_generalized_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_116 = validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_117 = validate_c_version_complex_single_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_118 = validate_c_version_complex_single_generalized_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_119 = validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_120 = validate_c_version_real_single_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_121 = validate_c_version_real_single_generalized_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_122 = 
validate_c_version_real_single_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_123 = validate_c_version_complex_double_generalized_decomp_1stage_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_124 = validate_c_version_complex_double_generalized_decomp_1stage_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__append_125 = validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_126 = validate_c_version_complex_single_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_127 = validate_c_version_complex_single_generalized_decomp_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_128 = validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_129 = validate_c_version_real_single_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_130 = validate_c_version_real_single_generalized_decomp_1stage_gpu_random +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_131 = 
validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_132 = validate_complex_double_eigenvectors_1stage_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_133 = validate_complex_double_eigenvectors_1stage_analytic_all_layouts +@WITH_MPI_TRUE@am__append_134 = validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_135 = validate_complex_double_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_FALSE@am__append_136 = validate_complex_double_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_137 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_138 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +@WITH_MPI_TRUE@am__append_139 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_140 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_FALSE@am__append_141 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_142 = validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_143 = 
validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_144 = validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_145 = validate_complex_double_eigenvectors_scalapack_all_analytic +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__append_146 = validate_complex_double_eigenvectors_scalapack_all_analytic +@WITH_SCALAPACK_TESTS_TRUE@am__append_147 = validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_148 = validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_149 = validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_150 = validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_151 = validate_complex_double_eigenvectors_scalapack_part_analytic +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__append_152 = validate_complex_double_eigenvectors_scalapack_part_analytic +@WITH_SCALAPACK_TESTS_TRUE@am__append_153 = validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_154 = validate_real_double_eigenvectors_1stage_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_155 = validate_real_double_eigenvectors_1stage_analytic_all_layouts +@WITH_MPI_TRUE@am__append_156 = validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_157 = validate_real_double_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_FALSE@am__append_158 = validate_real_double_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_159 = 
validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_160 = validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +@WITH_MPI_TRUE@am__append_161 = validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_162 = validate_real_double_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_FALSE@am__append_163 = validate_real_double_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_164 = validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_165 = validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_166 = validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_167 = validate_real_double_eigenvectors_scalapack_all_analytic +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__append_168 = validate_real_double_eigenvectors_scalapack_all_analytic +@WITH_SCALAPACK_TESTS_TRUE@am__append_169 = validate_real_double_eigenvectors_scalapack_all_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_170 = 
validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_171 = validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_172 = validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__append_173 = validate_real_double_eigenvectors_scalapack_part_analytic +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__append_174 = validate_real_double_eigenvectors_scalapack_part_analytic +@WITH_SCALAPACK_TESTS_TRUE@am__append_175 = validate_real_double_eigenvectors_scalapack_part_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_176 = validate_complex_single_eigenvectors_1stage_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_177 = validate_complex_single_eigenvectors_1stage_analytic_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_178 = validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_179 = validate_complex_single_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_180 = validate_complex_single_eigenvectors_1stage_analytic +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_181 = validate_complex_single_eigenvectors_1stage_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_182 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_183 = 
validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_184 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_185 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_186 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_187 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_188 = validate_real_single_eigenvectors_1stage_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_189 = validate_real_single_eigenvectors_1stage_analytic_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_190 = validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_191 = validate_real_single_eigenvectors_1stage_analytic +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_192 = 
validate_real_single_eigenvectors_1stage_analytic +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_193 = validate_real_single_eigenvectors_1stage_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_194 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_195 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_196 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_197 = validate_real_single_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_198 = validate_real_single_eigenvectors_2stage_all_kernels_analytic \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_199 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_200 = validate_real_double_eigenvalues_1stage_frank_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_201 = validate_real_double_eigenvalues_1stage_frank_all_layouts +@WITH_MPI_TRUE@am__append_202 = validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_203 = validate_real_double_eigenvalues_1stage_frank +@BUILD_KCOMPUTER_FALSE@am__append_204 = validate_real_double_eigenvalues_1stage_frank +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_205 = validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_206 = validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts +@WITH_MPI_TRUE@am__append_207 = validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_208 = validate_real_double_eigenvalues_2stage_default_kernel_frank +@BUILD_KCOMPUTER_FALSE@am__append_209 = validate_real_double_eigenvalues_2stage_default_kernel_frank +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_210 = validate_real_double_eigenvectors_1stage_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_211 = validate_real_double_eigenvectors_1stage_frank_all_layouts +@WITH_MPI_TRUE@am__append_212 = validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_213 = validate_real_double_eigenvectors_1stage_frank +@BUILD_KCOMPUTER_FALSE@am__append_214 = validate_real_double_eigenvectors_1stage_frank +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_215 = validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_216 = validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts +@WITH_MPI_TRUE@am__append_217 = 
validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_218 = validate_real_double_eigenvectors_2stage_all_kernels_frank \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank +@BUILD_KCOMPUTER_FALSE@am__append_219 = validate_real_double_eigenvectors_2stage_all_kernels_frank \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_frank +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_220 = validate_real_double_hermitian_multiply_1stage_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_221 = validate_real_double_hermitian_multiply_1stage_frank_all_layouts +@WITH_MPI_TRUE@am__append_222 = validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_223 = validate_real_double_hermitian_multiply_1stage_frank +@BUILD_KCOMPUTER_FALSE@am__append_224 = validate_real_double_hermitian_multiply_1stage_frank +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_225 = validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_226 = validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_227 = validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_228 = validate_real_double_eigenvalues_1stage_gpu_frank +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_229 = validate_real_double_eigenvalues_1stage_gpu_frank +@WITH_GPU_VERSION_TRUE@am__append_230 = validate_real_double_eigenvalues_1stage_gpu_frank_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_231 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_232 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_233 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_234 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_235 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank +@WITH_GPU_VERSION_TRUE@am__append_236 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_237 = validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_238 = validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_239 = validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_240 = validate_real_double_eigenvectors_1stage_gpu_frank +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_241 = validate_real_double_eigenvectors_1stage_gpu_frank +@WITH_GPU_VERSION_TRUE@am__append_242 = validate_real_double_eigenvectors_1stage_gpu_frank_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_243 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_244 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_245 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_246 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_247 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank +@WITH_GPU_VERSION_TRUE@am__append_248 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh \ +@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_249 = validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_250 = validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_251 = validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_252 = validate_real_double_hermitian_multiply_1stage_gpu_frank +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_253 = validate_real_double_hermitian_multiply_1stage_gpu_frank +@WITH_GPU_VERSION_TRUE@am__append_254 = validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_255 = validate_complex_double_cholesky_1stage_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_256 = validate_complex_double_cholesky_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_257 = validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_258 = validate_complex_double_cholesky_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_259 = validate_complex_double_cholesky_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_260 = validate_real_double_cholesky_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_261 = validate_real_double_cholesky_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_262 = validate_real_double_cholesky_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_263 = validate_real_double_cholesky_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_264 = validate_real_double_cholesky_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_265 = validate_real_double_cholesky_1stage_random_split_comm_myself +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_266 = validate_real_double_cholesky_1stage_random_split_comm_myself +@WITH_MPI_TRUE@am__append_267 = validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_268 = validate_complex_single_cholesky_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_269 = validate_complex_single_cholesky_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_270 = validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_271 = validate_complex_single_cholesky_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_272 = validate_complex_single_cholesky_1stage_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_273 = 
validate_complex_single_cholesky_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_274 = validate_real_single_cholesky_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_275 = validate_real_single_cholesky_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_276 = validate_real_single_cholesky_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_277 = validate_real_single_cholesky_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_278 = validate_real_single_cholesky_1stage_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_279 = validate_real_single_cholesky_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_280 = validate_complex_double_eigenvectors_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_281 = validate_complex_double_eigenvectors_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_282 = validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_283 = validate_complex_double_eigenvectors_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_284 = validate_complex_double_eigenvectors_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_285 = validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_286 = validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts +@WITH_MPI_TRUE@am__append_287 = validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ 
validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_288 = validate_complex_double_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@am__append_289 = validate_complex_double_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_290 = validate_real_double_eigenvectors_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_291 = validate_real_double_eigenvectors_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_292 = validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_293 = validate_real_double_eigenvectors_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_294 = validate_real_double_eigenvectors_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_295 = validate_real_double_eigenvectors_1stage_random_split_comm_myself \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_296 = validate_real_double_eigenvectors_1stage_random_split_comm_myself \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts +@WITH_MPI_TRUE@am__append_297 = validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh \ +@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ 
validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_298 = validate_real_double_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@am__append_299 = validate_real_double_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_300 = validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_301 = validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself +@WITH_MPI_TRUE@am__append_302 = validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_303 = validate_complex_single_eigenvectors_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_304 = validate_complex_single_eigenvectors_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_305 = validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_306 = validate_complex_single_eigenvectors_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_307 = validate_complex_single_eigenvectors_1stage_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_308 = validate_complex_single_eigenvectors_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_309 = validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ 
validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_310 = validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_311 = validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_312 = validate_complex_single_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_313 = validate_complex_single_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_314 = validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_315 = validate_real_single_eigenvectors_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_316 = validate_real_single_eigenvectors_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_317 = validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_318 = 
validate_real_single_eigenvectors_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_319 = validate_real_single_eigenvectors_1stage_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_320 = validate_real_single_eigenvectors_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_321 = validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_322 = validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_323 = validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_324 = validate_real_single_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_325 = validate_real_single_eigenvectors_2stage_all_kernels_random \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_326 = validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_327 = 
validate_complex_double_generalized_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_328 = validate_complex_double_generalized_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_329 = validate_complex_double_generalized_1stage_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@am__append_330 = validate_complex_double_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_331 = validate_complex_double_generalized_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_332 = validate_real_double_generalized_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_333 = validate_real_double_generalized_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_334 = validate_real_double_generalized_1stage_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@am__append_335 = validate_real_double_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_336 = validate_real_double_generalized_1stage_random +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_337 = validate_complex_single_generalized_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_338 = validate_complex_single_generalized_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_339 = validate_complex_single_generalized_1stage_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_340 = validate_complex_single_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_341 = validate_complex_single_generalized_1stage_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_342 = validate_complex_single_generalized_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_343 = validate_real_single_generalized_1stage_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_344 = validate_real_single_generalized_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_345 = validate_real_single_generalized_1stage_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_346 = validate_real_single_generalized_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_347 = validate_real_single_generalized_1stage_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_348 = validate_real_single_generalized_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_349 = validate_complex_double_generalized_decomp_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_350 = validate_complex_double_generalized_decomp_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_351 = validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_352 = validate_complex_double_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_353 = validate_complex_double_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_354 = validate_real_double_generalized_decomp_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_355 = validate_real_double_generalized_decomp_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_356 = validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_357 = validate_real_double_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_358 = validate_real_double_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_359 = validate_complex_single_generalized_decomp_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_360 = 
validate_complex_single_generalized_decomp_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_361 = validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_362 = validate_complex_single_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_363 = validate_complex_single_generalized_decomp_1stage_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_364 = validate_complex_single_generalized_decomp_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_365 = validate_real_single_generalized_decomp_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_366 = validate_real_single_generalized_decomp_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_367 = validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_368 = validate_real_single_generalized_decomp_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_369 = validate_real_single_generalized_decomp_1stage_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_370 = validate_real_single_generalized_decomp_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_371 = validate_complex_double_hermitian_multiply_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_372 = validate_complex_double_hermitian_multiply_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_373 = validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_374 = validate_complex_double_hermitian_multiply_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_375 = validate_complex_double_hermitian_multiply_1stage_random 
+@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_376 = validate_real_double_hermitian_multiply_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_377 = validate_real_double_hermitian_multiply_1stage_random_all_layouts +@WITH_MPI_TRUE@am__append_378 = validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_379 = validate_real_double_hermitian_multiply_1stage_random +@BUILD_KCOMPUTER_FALSE@am__append_380 = validate_real_double_hermitian_multiply_1stage_random +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_381 = validate_complex_single_hermitian_multiply_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_382 = validate_complex_single_hermitian_multiply_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_383 = validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_384 = validate_complex_single_hermitian_multiply_1stage_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_385 = validate_complex_single_hermitian_multiply_1stage_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_386 = validate_complex_single_hermitian_multiply_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_387 = validate_real_single_hermitian_multiply_1stage_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_388 = validate_real_single_hermitian_multiply_1stage_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_389 = validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_390 = validate_real_single_hermitian_multiply_1stage_random 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_391 = validate_real_single_hermitian_multiply_1stage_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_392 = validate_real_single_hermitian_multiply_1stage_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_393 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_394 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts +@WITH_MPI_TRUE@am__append_395 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_396 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random +@BUILD_KCOMPUTER_FALSE@am__append_397 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_398 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_399 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_400 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_401 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_402 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_403 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_404 = validate_complex_double_cholesky_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_405 = validate_complex_double_cholesky_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_406 = validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_407 = validate_complex_double_cholesky_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_408 = validate_complex_double_cholesky_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_409 = validate_complex_double_cholesky_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_410 = validate_real_double_cholesky_1stage_gpu_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_411 = validate_real_double_cholesky_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_412 = validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_413 = validate_real_double_cholesky_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_414 = validate_real_double_cholesky_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_415 = validate_real_double_cholesky_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_416 = validate_real_double_cholesky_1stage_gpu_random_split_comm_myself +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_417 = validate_real_double_cholesky_1stage_gpu_random_split_comm_myself +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_418 = validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_419 = validate_complex_single_cholesky_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_420 = validate_complex_single_cholesky_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_421 = validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_422 = validate_complex_single_cholesky_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_423 = validate_complex_single_cholesky_1stage_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_424 = validate_complex_single_cholesky_1stage_gpu_random_default.sh 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_425 = validate_real_single_cholesky_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_426 = validate_real_single_cholesky_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_427 = validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_428 = validate_real_single_cholesky_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_429 = validate_real_single_cholesky_1stage_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_430 = validate_real_single_cholesky_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_431 = validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_432 = validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_433 = validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_434 = validate_complex_double_eigenvectors_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_435 = validate_complex_double_eigenvectors_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_436 = validate_complex_double_eigenvectors_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_437 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_438 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_439 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_440 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_441 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_442 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh \ +@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_443 = validate_real_double_eigenvectors_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_444 = validate_real_double_eigenvectors_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_445 = validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_446 = validate_real_double_eigenvectors_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_447 = validate_real_double_eigenvectors_1stage_gpu_random 
+@WITH_GPU_VERSION_TRUE@am__append_448 = validate_real_double_eigenvectors_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_449 = validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_450 = validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_451 = validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_452 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_453 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_454 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh \ 
+@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_455 = validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_456 = validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_457 = validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_458 = validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_459 = validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_460 = validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_461 = validate_complex_single_eigenvectors_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_462 = validate_complex_single_eigenvectors_1stage_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_463 = validate_complex_single_eigenvectors_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_464 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_465 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_466 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_467 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_468 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_469 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_470 = validate_real_single_eigenvectors_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_471 = 
validate_real_single_eigenvectors_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_472 = validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_473 = validate_real_single_eigenvectors_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_474 = validate_real_single_eigenvectors_1stage_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_475 = validate_real_single_eigenvectors_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_476 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_477 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_478 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_479 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ 
validate_real_single_eigenvectors_2stage_default_kernel_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_480 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_481 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_482 = validate_complex_double_generalized_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_483 = validate_complex_double_generalized_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_484 = validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_485 = validate_complex_double_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_486 = validate_complex_double_generalized_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_487 = validate_complex_double_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_488 = validate_real_double_generalized_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_489 = validate_real_double_generalized_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_490 = validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_491 = validate_real_double_generalized_1stage_gpu_random 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_492 = validate_real_double_generalized_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_493 = validate_real_double_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_494 = validate_complex_single_generalized_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_495 = validate_complex_single_generalized_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_496 = validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_497 = validate_complex_single_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_498 = validate_complex_single_generalized_1stage_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_499 = validate_complex_single_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_500 = validate_real_single_generalized_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_501 = validate_real_single_generalized_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_502 = validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_503 = validate_real_single_generalized_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_504 = 
validate_real_single_generalized_1stage_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_505 = validate_real_single_generalized_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_506 = validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_507 = validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_508 = validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_509 = validate_complex_double_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_510 = validate_complex_double_generalized_decomp_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_511 = validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_512 = validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_513 = validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_514 = validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_515 = validate_real_double_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_516 = validate_real_double_generalized_decomp_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_517 = validate_real_double_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_518 = validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_519 = validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_520 = validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_521 = validate_complex_single_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_522 = validate_complex_single_generalized_decomp_1stage_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_523 = validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_524 = validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_525 = validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_526 = validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_527 = validate_real_single_generalized_decomp_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_528 = validate_real_single_generalized_decomp_1stage_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_529 = validate_real_single_generalized_decomp_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_530 = validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_531 = validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_532 = validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_533 = validate_complex_double_hermitian_multiply_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_534 = validate_complex_double_hermitian_multiply_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_535 = validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_536 = validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_537 = validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_538 = validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_539 = validate_real_double_hermitian_multiply_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_540 = validate_real_double_hermitian_multiply_1stage_gpu_random +@WITH_GPU_VERSION_TRUE@am__append_541 = validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_542 = validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_543 = validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_544 = validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_545 = validate_complex_single_hermitian_multiply_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_546 = validate_complex_single_hermitian_multiply_1stage_gpu_random +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_547 = validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_548 = validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_549 = validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_550 = validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_551 = validate_real_single_hermitian_multiply_1stage_gpu_random +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_552 = validate_real_single_hermitian_multiply_1stage_gpu_random +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_553 = validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_554 = validate_complex_double_cholesky_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_555 = validate_complex_double_cholesky_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_556 = validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_557 = validate_complex_double_cholesky_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_558 = validate_complex_double_cholesky_1stage_toeplitz 
+@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_559 = validate_real_double_cholesky_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_560 = validate_real_double_cholesky_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_561 = validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_562 = validate_real_double_cholesky_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_563 = validate_real_double_cholesky_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_564 = validate_complex_single_cholesky_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_565 = validate_complex_single_cholesky_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_566 = validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_567 = validate_complex_single_cholesky_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_568 = validate_complex_single_cholesky_1stage_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_569 = validate_complex_single_cholesky_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_570 = validate_real_single_cholesky_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_571 = validate_real_single_cholesky_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_572 = validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_573 = validate_real_single_cholesky_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_574 = validate_real_single_cholesky_1stage_toeplitz 
+@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_575 = validate_real_single_cholesky_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_576 = validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_577 = validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_578 = validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_579 = validate_complex_double_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_580 = validate_complex_double_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_581 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_582 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_583 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_584 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_585 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_586 = validate_real_double_eigenvalues_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_587 = validate_real_double_eigenvalues_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_588 = validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_589 = validate_real_double_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_590 = validate_real_double_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_591 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_592 = 
validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_593 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_594 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_595 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_596 = validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_597 = validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_598 = validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_599 = validate_complex_single_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_600 = validate_complex_single_eigenvalues_1stage_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_601 = validate_complex_single_eigenvalues_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_602 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_603 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_604 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_605 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_606 = 
validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_607 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_608 = validate_real_single_eigenvalues_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_609 = validate_real_single_eigenvalues_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_610 = validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_611 = validate_real_single_eigenvalues_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_612 = validate_real_single_eigenvalues_1stage_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_613 = validate_real_single_eigenvalues_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_614 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_615 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_616 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_617 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_618 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_619 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_620 = validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_621 = validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_622 = validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_623 = validate_complex_double_eigenvectors_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_624 = validate_complex_double_eigenvectors_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_625 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_626 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_627 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_628 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_629 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_630 = validate_real_double_eigenvectors_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_631 = validate_real_double_eigenvectors_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_632 = validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_633 = validate_real_double_eigenvectors_1stage_toeplitz 
+@BUILD_KCOMPUTER_FALSE@am__append_634 = validate_real_double_eigenvectors_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_635 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_636 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_637 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh \ +@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_638 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_639 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_640 = validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_641 = validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_642 = validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_643 = validate_complex_single_eigenvectors_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_644 = validate_complex_single_eigenvectors_1stage_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_645 = 
validate_complex_single_eigenvectors_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_646 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_647 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__append_648 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_649 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_650 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_651 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_652 = validate_real_single_eigenvectors_1stage_toeplitz_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_653 = validate_real_single_eigenvectors_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_654 = validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_655 = validate_real_single_eigenvectors_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_656 = validate_real_single_eigenvectors_1stage_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_657 = validate_real_single_eigenvectors_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_658 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_659 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_660 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_661 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_662 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz \ 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_663 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__append_664 = validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__append_665 = validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts +@WITH_MPI_TRUE@am__append_666 = validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@am__append_667 = validate_real_double_solve_tridiagonal_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@am__append_668 = validate_real_double_solve_tridiagonal_1stage_toeplitz +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_669 = validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_670 = validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__append_671 = validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_672 = validate_real_single_solve_tridiagonal_1stage_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_673 = validate_real_single_solve_tridiagonal_1stage_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_674 = validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_675 = validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_676 = 
validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_677 = validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_678 = validate_complex_double_cholesky_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_679 = validate_complex_double_cholesky_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_680 = validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_681 = validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_682 = validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_683 = validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_684 = validate_real_double_cholesky_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_685 = validate_real_double_cholesky_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_686 = validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_687 = validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_688 = validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_689 = validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_690 = validate_complex_single_cholesky_1stage_gpu_toeplitz 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_691 = validate_complex_single_cholesky_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_692 = validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_693 = validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_694 = validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_695 = validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_696 = validate_real_single_cholesky_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_697 = validate_real_single_cholesky_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_698 = validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_699 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_700 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_701 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_702 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_703 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_704 = 
validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_705 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_706 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_707 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_708 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_709 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_710 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_711 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_712 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_713 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_714 = validate_real_double_eigenvalues_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_715 = validate_real_double_eigenvalues_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_716 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_717 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_718 = 
validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_719 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_720 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_721 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_722 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_723 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_724 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_725 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_726 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_727 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_728 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_729 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_730 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_731 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_732 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_733 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_734 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_735 = validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_736 = validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_737 = validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_738 = validate_real_single_eigenvalues_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_739 = validate_real_single_eigenvalues_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_740 = validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_741 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_742 = 
validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_743 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_744 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_745 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_746 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_747 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_748 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_749 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_750 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_751 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_752 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_753 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_754 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_755 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_756 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_757 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_758 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh \ +@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_759 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_760 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_761 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_762 = validate_real_double_eigenvectors_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_763 = validate_real_double_eigenvectors_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_764 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh 
+@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_765 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_766 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_767 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_768 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_769 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_770 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh \ +@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_771 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_772 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_773 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_774 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_775 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_776 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_777 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_778 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_779 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_780 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz \ 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_781 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__append_782 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_783 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_784 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_785 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_786 = validate_real_single_eigenvectors_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_787 = validate_real_single_eigenvectors_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_788 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_789 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_790 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_791 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_792 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_793 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_794 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_795 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts 
+@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_796 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_797 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__append_798 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__append_799 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz +@WITH_GPU_VERSION_TRUE@am__append_800 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_801 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_802 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__append_803 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_804 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_805 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__append_806 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__append_807 = validate_autotune_c_version_complex_double_extended.sh \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ validate_autotune_c_version_real_double_extended.sh +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__append_808 = validate_autotune_c_version_complex_double \ 
+@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ validate_autotune_c_version_real_double +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_809 = validate_autotune_c_version_complex_single_extended.sh +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_810 = validate_autotune_c_version_complex_single +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_811 = validate_autotune_c_version_real_single_extended.sh +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_812 = validate_autotune_c_version_real_single +@ENABLE_AUTOTUNING_TRUE@am__append_813 = validate_autotune_complex_double_extended.sh \ +@ENABLE_AUTOTUNING_TRUE@ validate_autotune_real_double_extended.sh +@ENABLE_AUTOTUNING_TRUE@am__append_814 = \ +@ENABLE_AUTOTUNING_TRUE@ validate_autotune_complex_double \ +@ENABLE_AUTOTUNING_TRUE@ validate_autotune_real_double +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_815 = validate_autotune_complex_single_extended.sh +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_816 = validate_autotune_complex_single +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_817 = validate_autotune_real_single_extended.sh +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_818 = validate_autotune_real_single +@ENABLE_AUTOTUNING_TRUE@am__append_819 = validate_multiple_objs_real_double_extended.sh +@ENABLE_AUTOTUNING_TRUE@am__append_820 = validate_multiple_objs_real_double +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_821 = test_skewsymmetric_real_single_extended.sh +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_822 = test_skewsymmetric_real_single +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__append_823 = validate_multiple_objs_real_double_c_version_extended.sh +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__append_824 = 
validate_multiple_objs_real_double_c_version +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_825 = validate_single_real_2stage_banded@SUFFIX@ +@WANT_SINGLE_PRECISION_REAL_TRUE@am__append_826 = validate_single_real_2stage_banded@SUFFIX@_default.sh +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_827 = validate_single_complex_2stage_banded@SUFFIX@ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__append_828 = validate_single_complex_2stage_banded@SUFFIX@_default.sh +@WITH_PYTHON_TESTS_TRUE@am__append_829 = test_python.sh +@WITH_SCALAPACK_TESTS_TRUE@am__append_830 = \ +@WITH_SCALAPACK_TESTS_TRUE@ test/shared/test_scalapack_template.F90 -TESTS = $(am__EXEEXT_2) subdir = . ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_gnu_make.m4 \ + $(top_srcdir)/m4/ax_ext.m4 \ + $(top_srcdir)/m4/ax_gcc_x86_avx_xgetbv.m4 \ + $(top_srcdir)/m4/ax_gcc_x86_cpuid.m4 \ $(top_srcdir)/m4/ax_prog_cc_mpi.m4 \ $(top_srcdir)/m4/ax_prog_doxygen.m4 \ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ $(top_srcdir)/fdep/fortran_dependencies.m4 \ $(top_srcdir)/m4/ax_elpa_openmp.m4 \ $(top_srcdir)/m4/ax_prog_fc_mpi.m4 \ - $(top_srcdir)/m4/ax_elpa_specific_kernels.m4 \ + $(top_srcdir)/m4/ax_fc_check_define.m4 \ $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ - $(am__configure_deps) $(dist_doc_DATA) $(dist_files_DATA) \ - $(nobase_elpa_include_HEADERS) $(am__DIST_COMMON) + $(am__configure_deps) $(am__pyelpa_PYTHON_DIST) \ + $(dist_doc_DATA) $(nobase_elpa_include_HEADERS) \ + $(am__DIST_COMMON) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ configure.lineno config.status.lineno mkinstalldirs = $(install_sh) -d CONFIG_HEADER = config.h -CONFIG_CLEAN_FILES = Doxyfile ${PKG_CONFIG_FILE} 
+CONFIG_CLEAN_FILES = Doxyfile ${PKG_CONFIG_FILE} elpa/elpa_constants.h \ + elpa/elpa_version.h elpa/elpa_build_config.h CONFIG_CLEAN_VPATH_FILES = +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_1 = validate_c_version_complex_double_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_2 = validate_c_version_complex_single_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_3 = validate_c_version_real_single_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_4 = validate_c_version_complex_double_generalized_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_5 = validate_c_version_complex_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_6 = validate_c_version_real_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_7 = 
validate_c_version_complex_double_generalized_decomp_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_8 = validate_c_version_complex_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_9 = validate_c_version_real_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_10 = validate_c_version_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_11 = validate_c_version_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_12 = validate_c_version_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_13 = 
validate_c_version_complex_double_generalized_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_14 = validate_c_version_complex_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_15 = validate_c_version_real_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_16 = validate_c_version_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_17 = validate_c_version_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_18 = validate_c_version_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_19 = validate_complex_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_20 = validate_complex_double_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_21 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_22 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ 
validate_complex_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_23 = validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_24 = validate_complex_double_eigenvectors_scalapack_all_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_25 = validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_26 = validate_complex_double_eigenvectors_scalapack_part_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_27 = validate_real_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_28 = validate_real_double_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_29 = validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_30 = validate_real_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_31 = validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_32 = validate_real_double_eigenvectors_scalapack_all_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_33 = validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_34 = validate_real_double_eigenvectors_scalapack_part_analytic$(EXEEXT) 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_35 = validate_complex_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_36 = validate_complex_single_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_37 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_38 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_39 = validate_real_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_40 = validate_real_single_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_41 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_42 = validate_real_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_43 = validate_real_double_eigenvalues_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_44 = 
validate_real_double_eigenvalues_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_45 = validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_46 = validate_real_double_eigenvalues_2stage_default_kernel_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_47 = validate_real_double_eigenvectors_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_48 = validate_real_double_eigenvectors_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_49 = validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_50 = validate_real_double_eigenvectors_2stage_all_kernels_frank$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_51 = validate_real_double_hermitian_multiply_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_52 = validate_real_double_hermitian_multiply_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_53 = validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_54 = validate_real_double_eigenvalues_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_55 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_56 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_57 = validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_58 = 
validate_real_double_eigenvectors_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_59 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_60 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_61 = validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_62 = validate_real_double_hermitian_multiply_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_63 = validate_complex_double_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_64 = validate_complex_double_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_65 = validate_real_double_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_66 = validate_real_double_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_67 = validate_real_double_cholesky_1stage_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_68 = validate_complex_single_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_69 = validate_complex_single_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_70 = validate_real_single_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_71 = 
validate_real_single_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_72 = validate_complex_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_73 = validate_complex_double_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_74 = validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_75 = validate_complex_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_76 = validate_real_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_77 = validate_real_double_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_78 = validate_real_double_eigenvectors_1stage_random_split_comm_myself$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_79 = validate_real_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_80 = validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_81 = validate_complex_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_82 = validate_complex_single_eigenvectors_1stage_random$(EXEEXT) 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_83 = validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_84 = validate_complex_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_85 = validate_real_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_86 = validate_real_single_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_87 = validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_88 = validate_real_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_89 = validate_complex_double_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_90 = validate_complex_double_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_91 = validate_real_double_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_92 = validate_real_double_generalized_1stage_random$(EXEEXT) 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_93 = validate_complex_single_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_94 = validate_complex_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_95 = validate_real_single_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_96 = validate_real_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_97 = validate_complex_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_98 = validate_complex_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_99 = validate_real_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_100 = validate_real_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_101 = validate_complex_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_102 = validate_complex_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_103 = validate_real_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_104 = validate_real_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_105 = validate_complex_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_106 = validate_complex_double_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_107 = 
validate_real_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_108 = validate_real_double_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_109 = validate_complex_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_110 = validate_complex_single_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_111 = validate_real_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_112 = validate_real_single_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_113 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_114 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_115 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_116 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_117 = 
validate_complex_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_118 = validate_complex_double_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_119 = validate_real_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_120 = validate_real_double_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_121 = validate_real_double_cholesky_1stage_gpu_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_122 = validate_complex_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_123 = validate_complex_single_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_124 = validate_real_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_125 = validate_real_single_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_126 = validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_127 = validate_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_128 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_129 = 
validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_130 = validate_real_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_131 = validate_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_132 = validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_133 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_134 = validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_135 = validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_136 = validate_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_137 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_138 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_139 = validate_real_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_140 = validate_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_141 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_142 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_143 = validate_complex_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_144 = validate_complex_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_145 = 
validate_real_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_146 = validate_real_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_147 = validate_complex_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_148 = validate_complex_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_149 = validate_real_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_150 = validate_real_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_151 = validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_152 = validate_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_153 = validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_154 = validate_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_155 = validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_156 = validate_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_157 = 
validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_158 = validate_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_159 = validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_160 = validate_complex_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_161 = validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_162 = validate_real_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_163 = validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_164 = validate_complex_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_165 = validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_166 = validate_real_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_167 = validate_complex_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_168 = validate_complex_double_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_169 = validate_real_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_170 = validate_real_double_cholesky_1stage_toeplitz$(EXEEXT) 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_171 = validate_complex_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_172 = validate_complex_single_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_173 = validate_real_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_174 = validate_real_single_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_175 = validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_176 = validate_complex_double_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_177 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_178 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_179 = validate_real_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_180 = validate_real_double_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_181 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_182 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_183 = validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_184 = validate_complex_single_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_185 = 
validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_186 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_187 = validate_real_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_188 = validate_real_single_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_189 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_190 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_191 = validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_192 = validate_complex_double_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_193 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_194 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_195 = validate_real_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_196 = validate_real_double_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_197 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@ 
validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_198 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_199 = validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_200 = validate_complex_single_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_201 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_202 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_203 = validate_real_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_204 = validate_real_single_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_205 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_206 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_MPI_TRUE@am__EXEEXT_207 = validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@am__EXEEXT_208 = validate_real_double_solve_tridiagonal_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_209 = validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_210 = validate_real_single_solve_tridiagonal_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_211 = validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_212 = validate_complex_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_213 = validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_214 = validate_real_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_215 = validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_216 = validate_complex_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_217 = validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_218 = validate_real_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_219 = 
validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_220 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_221 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_222 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_223 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_224 = validate_real_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_225 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_226 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_227 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_228 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_229 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_230 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_231 = 
validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_232 = validate_real_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_233 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_234 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_235 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_236 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_237 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_238 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_239 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_240 = validate_real_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_241 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_242 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_243 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_244 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_245 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_246 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_247 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_248 = validate_real_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_249 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ 
+@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_250 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_251 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_252 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_253 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_254 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) +am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(libdir)" \ + "$(DESTDIR)$(pyelpadir)" "$(DESTDIR)$(pyelpadir)" \ + "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" \ + "$(DESTDIR)$(docdir)" "$(DESTDIR)$(pkgconfigdir)" \ + "$(DESTDIR)$(elpa_includedir)" "$(DESTDIR)$(elpa_includedir)" +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_255 = validate_c_version_complex_double_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_256 = validate_c_version_complex_single_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_257 = validate_c_version_real_single_eigenvectors_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_258 = validate_c_version_complex_double_generalized_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_259 = validate_c_version_complex_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_260 = validate_c_version_real_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_261 = validate_c_version_complex_double_generalized_decomp_1stage_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_262 = validate_c_version_complex_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_263 = validate_c_version_real_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_264 = validate_c_version_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) \ 
+@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_265 = validate_c_version_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_266 = validate_c_version_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_267 = validate_c_version_complex_double_generalized_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_268 = validate_c_version_complex_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_269 = validate_c_version_real_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_270 = 
validate_c_version_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ validate_c_version_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_271 = validate_c_version_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_272 = validate_c_version_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_273 = validate_complex_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_274 = validate_complex_double_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_275 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_276 = validate_complex_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_277 = validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_278 = validate_complex_double_eigenvectors_scalapack_all_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_279 = validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_280 = validate_complex_double_eigenvectors_scalapack_part_analytic$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_281 = validate_real_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_282 = validate_real_double_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_283 = validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_284 = validate_real_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_285 = validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_286 = validate_real_double_eigenvectors_scalapack_all_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_287 = validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_SCALAPACK_TESTS_TRUE@am__EXEEXT_288 = validate_real_double_eigenvectors_scalapack_part_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_289 = validate_complex_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_290 = validate_complex_single_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_291 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_292 = validate_complex_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_293 = validate_real_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_294 = validate_real_single_eigenvectors_1stage_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_295 = validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_296 = validate_real_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_297 = validate_real_double_eigenvalues_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_298 = validate_real_double_eigenvalues_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_299 = validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_300 = validate_real_double_eigenvalues_2stage_default_kernel_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_301 = validate_real_double_eigenvectors_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_302 = validate_real_double_eigenvectors_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_303 = 
validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_304 = validate_real_double_eigenvectors_2stage_all_kernels_frank$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_305 = validate_real_double_hermitian_multiply_1stage_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_306 = validate_real_double_hermitian_multiply_1stage_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_307 = validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_308 = validate_real_double_eigenvalues_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_309 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_310 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_311 = validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_312 = validate_real_double_eigenvectors_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_313 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_314 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ 
validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_315 = validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_316 = validate_real_double_hermitian_multiply_1stage_gpu_frank$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_317 = validate_complex_double_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_318 = validate_complex_double_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_319 = validate_real_double_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_320 = validate_real_double_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_321 = validate_real_double_cholesky_1stage_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_322 = validate_complex_single_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_323 = validate_complex_single_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_324 = validate_real_single_cholesky_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_325 = validate_real_single_cholesky_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_326 = validate_complex_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_327 = validate_complex_double_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_328 = validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@am__EXEEXT_329 = validate_complex_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_330 = validate_real_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_331 = validate_real_double_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_332 = validate_real_double_eigenvectors_1stage_random_split_comm_myself$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_333 = validate_real_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_334 = validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_335 = validate_complex_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_336 = validate_complex_single_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_337 = validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_338 = validate_complex_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_339 = validate_real_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_340 = validate_real_single_eigenvectors_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_341 = validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_342 = validate_real_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_343 = validate_complex_double_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_344 = validate_complex_double_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_345 = validate_real_double_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_346 = validate_real_double_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_347 = validate_complex_single_generalized_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_348 = validate_complex_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_349 = validate_real_single_generalized_1stage_random_all_layouts$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_350 = validate_real_single_generalized_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_351 = validate_complex_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_352 = validate_complex_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_353 = validate_real_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_354 = validate_real_double_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_355 = validate_complex_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_356 = validate_complex_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_357 = validate_real_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_358 = validate_real_single_generalized_decomp_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_359 = validate_complex_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_360 = validate_complex_double_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_361 = validate_real_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_362 = validate_real_double_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_363 = validate_complex_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_364 = 
validate_complex_single_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_365 = validate_real_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_366 = validate_real_single_hermitian_multiply_1stage_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_367 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_368 = validate_real_double_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_369 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_370 = validate_real_single_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_371 = validate_complex_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_372 = validate_complex_double_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_373 = validate_real_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_374 = 
validate_real_double_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_375 = validate_real_double_cholesky_1stage_gpu_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_376 = validate_complex_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_377 = validate_complex_single_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_378 = validate_real_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_379 = validate_real_single_cholesky_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_380 = validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_381 = validate_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_382 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_383 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_384 = validate_real_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_385 = 
validate_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_386 = validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_387 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_388 = validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_389 = validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_390 = validate_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_391 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_392 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ 
validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_393 = validate_real_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_394 = validate_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_395 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_396 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_397 = validate_complex_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_398 = validate_complex_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_399 = validate_real_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_400 = validate_real_double_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_401 = validate_complex_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_402 = 
validate_complex_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_403 = validate_real_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_404 = validate_real_single_generalized_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_405 = validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_406 = validate_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_407 = validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_408 = validate_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_409 = validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_410 = validate_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_411 = validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_412 = validate_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_413 = validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_414 = 
validate_complex_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_415 = validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_416 = validate_real_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_417 = validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_418 = validate_complex_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_419 = validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_420 = validate_real_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_421 = validate_complex_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_422 = validate_complex_double_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_423 = validate_real_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_424 = validate_real_double_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_425 = validate_complex_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_426 = validate_complex_single_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_427 = validate_real_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_428 = validate_real_single_cholesky_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_429 = validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_430 = validate_complex_double_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_431 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_432 = validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_433 = validate_real_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_434 = validate_real_double_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_435 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_436 = validate_real_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_437 = validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_438 = validate_complex_single_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_439 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_440 = validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_441 = validate_real_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_442 = 
validate_real_single_eigenvalues_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_443 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_444 = validate_real_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_445 = validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_446 = validate_complex_double_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_447 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_448 = validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_449 = validate_real_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_450 = validate_real_double_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_451 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_452 = validate_real_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@ validate_real_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_453 = validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_454 = validate_complex_single_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am__EXEEXT_455 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_456 = validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_457 = validate_real_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_458 = validate_real_single_eigenvectors_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_459 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_460 = validate_real_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_MPI_TRUE@am__EXEEXT_461 = validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@am__EXEEXT_462 = validate_real_double_solve_tridiagonal_1stage_toeplitz$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am__EXEEXT_463 = validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_464 = validate_real_single_solve_tridiagonal_1stage_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_465 = validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_466 = validate_complex_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_467 = validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_468 = validate_real_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_469 = validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_470 = validate_complex_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_471 = validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_472 = validate_real_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_473 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_474 = validate_complex_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_475 = 
validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_476 = validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_477 = validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_478 = validate_real_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_479 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_480 = validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_481 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_482 = validate_complex_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_483 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_484 = validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_485 = validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_486 = validate_real_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) 
+@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_487 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_488 = validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_489 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_490 = validate_complex_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_491 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_492 = validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_493 = validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_494 = validate_real_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_495 = validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_496 = 
validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@ validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_497 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_498 = validate_complex_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_499 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_500 = validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_501 = validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_502 = validate_real_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_503 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_504 = validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) \ +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_505 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_506 = validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am__EXEEXT_507 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) +@BUILD_KCOMPUTER_FALSE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am__EXEEXT_508 = validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_509 = validate_autotune_c_version_complex_double$(EXEEXT) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ validate_autotune_c_version_real_double$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_510 = validate_autotune_c_version_complex_single$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_511 = validate_autotune_c_version_real_single$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@am__EXEEXT_512 = validate_autotune_complex_double$(EXEEXT) \ +@ENABLE_AUTOTUNING_TRUE@ validate_autotune_real_double$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_513 = validate_autotune_complex_single$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_514 = validate_autotune_real_single$(EXEEXT) 
+@ENABLE_AUTOTUNING_TRUE@am__EXEEXT_515 = validate_multiple_objs_real_double$(EXEEXT) +@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_516 = test_skewsymmetric_real_single$(EXEEXT) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am__EXEEXT_517 = validate_multiple_objs_real_double_c_version$(EXEEXT) +@WANT_SINGLE_PRECISION_REAL_TRUE@am__EXEEXT_518 = validate_single_real_2stage_banded@SUFFIX@$(EXEEXT) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am__EXEEXT_519 = validate_single_complex_2stage_banded@SUFFIX@$(EXEEXT) +PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS) am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; am__vpath_adj = case $$p in \ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ @@ -197,351 +2057,3850 @@ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ $(am__cd) "$$dir" && rm -f $$files; }; \ } -am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \ - "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" \ - "$(DESTDIR)$(docdir)" "$(DESTDIR)$(filesdir)" \ - "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(elpa_includedir)" -LTLIBRARIES = $(lib_LTLIBRARIES) -libelpa@SUFFIX@_la_LIBADD = -am__libelpa@SUFFIX@_la_SOURCES_DIST = src/mod_precision.f90 \ - src/mod_mpi.F90 src/mod_mpi_stubs.F90 \ - src/elpa2_kernels/mod_fortran_interfaces.F90 \ - src/elpa_utilities.F90 src/elpa1_compute.F90 src/elpa1.F90 \ - src/elpa2_utilities.F90 src/mod_pack_unpack_real.F90 \ - src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_complex.F90 \ - src/mod_pack_unpack_complex.F90 src/aligned_mem.F90 \ - src/elpa2_compute.F90 src/elpa2.F90 src/elpa_c_interface.F90 \ - src/elpa_qr/qr_utils.F90 src/elpa_qr/elpa_qrkernels.f90 \ - src/elpa_qr/elpa_pdlarfb.F90 src/elpa_qr/elpa_pdgeqrf.F90 \ - src/timer.F90 src/ftimings/ftimings.F90 \ - src/ftimings/ftimings_type.F90 src/ftimings/ftimings_value.F90 \ - src/ftimings/highwater_mark.c src/ftimings/resident_set_size.c \ - src/ftimings/time.c src/ftimings/virtual_memory.c \ 
- src/ftimings/papi.c src/mod_time_c.F90 \ - src/elpa2_kernels/elpa2_kernels_real.F90 \ - src/elpa2_kernels/elpa2_kernels_complex.F90 \ - src/elpa2_kernels/elpa2_kernels_real_simple.F90 \ - src/elpa2_kernels/elpa2_kernels_complex_simple.F90 \ - src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ - src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ - src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \ - src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c \ - src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \ - src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c \ - src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c \ - src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c \ - src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c +LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES) \ + $(pyelpa_LTLIBRARIES) +libelpa@SUFFIX@_la_DEPENDENCIES = libelpa@SUFFIX@_public.la \ + libelpa@SUFFIX@_private.la +am_libelpa@SUFFIX@_la_OBJECTS = +libelpa@SUFFIX@_la_OBJECTS = $(am_libelpa@SUFFIX@_la_OBJECTS) +libelpa@SUFFIX@_private_la_LIBADD = +am__libelpa@SUFFIX@_private_la_SOURCES_DIST = src/elpa_impl.F90 \ + src/elpa_autotune_impl.F90 src/elpa_abstract_impl.F90 \ + src/helpers/mod_precision.F90 \ + src/helpers/mod_blas_interfaces.F90 \ + src/helpers/mod_scalapack_interfaces.F90 \ + src/helpers/mod_mpi.F90 src/helpers/mod_mpi_stubs.F90 \ + src/helpers/mod_omp.F90 \ + src/elpa_generated_fortran_interfaces.F90 \ + src/elpa2/mod_redist_band.F90 \ + src/elpa2/mod_pack_unpack_cpu.F90 \ + src/elpa2/mod_compute_hh_trafo.F90 src/helpers/aligned_mem.F90 \ + src/elpa1/elpa1_compute_private.F90 \ + src/elpa1/elpa1_auxiliary.F90 \ + src/elpa2/elpa2_determine_workload.F90 \ + src/elpa2/elpa2_compute.F90 \ + src/elpa2/kernels/mod_single_hh_trafo_real.F90 \ + src/GPU/check_for_gpu.F90 
src/GPU/mod_cuda.F90 \ + src/elpa2/GPU/interface_c_kernel.F90 \ + src/elpa2/mod_pack_unpack_gpu.F90 src/elpa2/qr/qr_utils.F90 \ + src/elpa2/qr/elpa_qrkernels.F90 src/elpa2/qr/elpa_pdlarfb.F90 \ + src/elpa2/qr/elpa_pdgeqrf.F90 src/elpa1/elpa1.F90 \ + src/elpa2/elpa2.F90 src/elpa_generalized/cannon.c \ + src/helpers/matrix_plot.F90 \ + src/general/mod_elpa_skewsymmetric_blas.F90 src/elpa_index.c \ + src/elpa_c_interface.c src/general/elpa_utilities.F90 \ + src/ftimings/ftimings.F90 src/ftimings/ftimings_type.F90 \ + src/ftimings/ftimings_value.F90 src/ftimings/highwater_mark.c \ + src/ftimings/resident_set_size.c src/ftimings/time.c \ + src/ftimings/virtual_memory.c src/ftimings/papi.c \ + src/helpers/timer_dummy.F90 src/GPU/cudaFunctions.cu \ + src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu \ + src/helpers/mod_time_c.F90 src/helpers/get_cpuid_set.c \ + src/helpers/mod_simd_kernel.F90 src/elpa2/kernels/real.F90 \ + src/elpa2/kernels/complex.F90 \ + src/elpa2/kernels/real_simple.F90 \ + src/elpa2/kernels/complex_simple.F90 \ + src/elpa2/kernels/real_simple_block4.F90 \ + src/elpa2/kernels/real_simple_block6.F90 \ + src/elpa2/kernels/real_bgp.f90 src/elpa2/kernels/real_bgq.f90 \ + src/elpa2/kernels/asm_x86_64_double_precision.s \ + src/elpa2/kernels/asm_x86_64_single_precision.s \ + src/elpa2/kernels/real_sparc64_2hv_double_precision.c \ + src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \ + src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \ + src/elpa2/kernels/real_vsx_2hv_double_precision.c \ + src/elpa2/kernels/real_vsx_2hv_single_precision.c \ + src/elpa2/kernels/real_sse_2hv_double_precision.c \ + src/elpa2/kernels/real_sse_2hv_single_precision.c \ + src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \ + src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \ + src/elpa2/kernels/real_avx512_2hv_double_precision.c \ + src/elpa2/kernels/real_avx512_2hv_single_precision.c \ + src/elpa2/kernels/real_sparc64_4hv_double_precision.c \ + 
src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \ + src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \ + src/elpa2/kernels/real_vsx_4hv_double_precision.c \ + src/elpa2/kernels/real_vsx_4hv_single_precision.c \ + src/elpa2/kernels/real_sse_4hv_double_precision.c \ + src/elpa2/kernels/real_sse_4hv_single_precision.c \ + src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \ + src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \ + src/elpa2/kernels/real_avx512_4hv_double_precision.c \ + src/elpa2/kernels/real_avx512_4hv_single_precision.c \ + src/elpa2/kernels/real_sparc64_6hv_double_precision.c \ + src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \ + src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \ + src/elpa2/kernels/real_vsx_6hv_double_precision.c \ + src/elpa2/kernels/real_vsx_6hv_single_precision.c \ + src/elpa2/kernels/real_sse_6hv_double_precision.c \ + src/elpa2/kernels/real_sse_6hv_single_precision.c \ + src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \ + src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \ + src/elpa2/kernels/real_avx512_6hv_double_precision.c \ + src/elpa2/kernels/real_avx512_6hv_single_precision.c \ + src/elpa2/kernels/complex_sse_1hv_double_precision.c \ + src/elpa2/kernels/complex_sse_1hv_single_precision.c \ + src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \ + src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \ + src/elpa2/kernels/complex_avx512_1hv_double_precision.c \ + src/elpa2/kernels/complex_avx512_1hv_single_precision.c \ + src/elpa2/kernels/complex_sse_2hv_double_precision.c \ + src/elpa2/kernels/complex_sse_2hv_single_precision.c \ + src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \ + src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \ + src/elpa2/kernels/complex_avx512_2hv_double_precision.c \ + src/elpa2/kernels/complex_avx512_2hv_single_precision.c \ + src/helpers/print_build_config.c am__dirstamp = $(am__leading_dot)dirstamp 
-@HAVE_DETAILED_TIMINGS_TRUE@am__objects_1 = src/timer.lo \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings.lo \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_type.lo \ -@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_value.lo \ +@HAVE_DETAILED_TIMINGS_TRUE@am__objects_1 = src/ftimings/libelpa@SUFFIX@_private_la-ftimings.lo \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/libelpa@SUFFIX@_private_la-ftimings_type.lo \ +@HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/libelpa@SUFFIX@_private_la-ftimings_value.lo \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/highwater_mark.lo \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/resident_set_size.lo \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/time.lo \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/virtual_memory.lo \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/papi.lo -@WITH_MPI_FALSE@am__objects_2 = src/mod_time_c.lo -@HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__objects_3 = src/ftimings/time.lo -@WITH_REAL_GENERIC_KERNEL_TRUE@am__objects_4 = src/elpa2_kernels/elpa2_kernels_real.lo -@WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__objects_5 = src/elpa2_kernels/elpa2_kernels_complex.lo -@WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_6 = src/elpa2_kernels/elpa2_kernels_real_simple.lo -@WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_7 = src/elpa2_kernels/elpa2_kernels_complex_simple.lo -@WITH_REAL_BGP_KERNEL_TRUE@am__objects_8 = src/elpa2_kernels/elpa2_kernels_real_bgp.lo -@WITH_REAL_BGQ_KERNEL_TRUE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_real_bgq.lo -@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo -@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo -@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse_2hv.lo -@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__objects_13 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.lo 
-@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__objects_14 = src/elpa2_kernels/elpa2_kernels_real_sse_4hv.lo -@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__objects_15 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.lo -@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__objects_16 = src/elpa2_kernels/elpa2_kernels_real_sse_6hv.lo -@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__objects_17 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.lo -@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__objects_18 = src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.lo -@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__objects_19 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.lo -@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__objects_20 = src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.lo -@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.lo -am_libelpa@SUFFIX@_la_OBJECTS = src/mod_precision.lo src/mod_mpi.lo \ - src/mod_mpi_stubs.lo \ - src/elpa2_kernels/mod_fortran_interfaces.lo \ - src/elpa_utilities.lo src/elpa1_compute.lo src/elpa1.lo \ - src/elpa2_utilities.lo src/mod_pack_unpack_real.lo \ - src/elpa2_kernels/mod_single_hh_trafo_real.lo \ - src/mod_compute_hh_trafo_real.lo \ - src/mod_compute_hh_trafo_complex.lo \ - src/mod_pack_unpack_complex.lo src/aligned_mem.lo \ - src/elpa2_compute.lo src/elpa2.lo src/elpa_c_interface.lo \ - src/elpa_qr/qr_utils.lo src/elpa_qr/elpa_qrkernels.lo \ - src/elpa_qr/elpa_pdlarfb.lo src/elpa_qr/elpa_pdgeqrf.lo \ +@HAVE_DETAILED_TIMINGS_FALSE@am__objects_2 = src/helpers/libelpa@SUFFIX@_private_la-timer_dummy.lo +@WITH_GPU_VERSION_TRUE@am__objects_3 = src/GPU/cudaFunctions.lo \ +@WITH_GPU_VERSION_TRUE@ src/GPU/cuUtils.lo \ +@WITH_GPU_VERSION_TRUE@ src/elpa2/GPU/ev_tridi_band_gpu_c_v2.lo +@WITH_MPI_FALSE@am__objects_4 = src/helpers/libelpa@SUFFIX@_private_la-mod_time_c.lo +@HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__objects_5 = src/ftimings/time.lo +@HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE@am__objects_6 = src/helpers/get_cpuid_set.lo \ 
+@HAVE_HETEROGENOUS_CLUSTER_SUPPORT_TRUE@ src/helpers/libelpa@SUFFIX@_private_la-mod_simd_kernel.lo +@WITH_REAL_GENERIC_KERNEL_TRUE@am__objects_7 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real.lo +@WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__objects_8 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex.lo +@WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_9 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple.lo +@WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_10 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex_simple.lo +@WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL_TRUE@am__objects_11 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block4.lo +@WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL_TRUE@am__objects_12 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block6.lo +@WITH_REAL_BGP_KERNEL_TRUE@am__objects_13 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgp.lo +@WITH_REAL_BGQ_KERNEL_TRUE@am__objects_14 = src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgq.lo +@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__objects_15 = src/elpa2/kernels/asm_x86_64_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__objects_16 = src/elpa2/kernels/asm_x86_64_single_precision.lo +@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__objects_17 = src/elpa2/kernels/asm_x86_64_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__objects_18 = src/elpa2/kernels/asm_x86_64_single_precision.lo +@WITH_REAL_SPARC64_BLOCK2_KERNEL_TRUE@am__objects_19 = src/elpa2/kernels/real_sparc64_2hv_double_precision.lo +@WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE@am__objects_20 = src/elpa2/kernels/real_neon_arch64_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL_TRUE@am__objects_21 = src/elpa2/kernels/real_neon_arch64_2hv_single_precision.lo 
+@WITH_REAL_VSX_BLOCK2_KERNEL_TRUE@am__objects_22 = src/elpa2/kernels/real_vsx_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK2_KERNEL_TRUE@am__objects_23 = src/elpa2/kernels/real_vsx_2hv_single_precision.lo +@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__objects_24 = src/elpa2/kernels/real_sse_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__objects_25 = src/elpa2/kernels/real_sse_2hv_single_precision.lo +@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__objects_26 = src/elpa2/kernels/real_avx-avx2_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__objects_27 = src/elpa2/kernels/real_avx-avx2_2hv_single_precision.lo +@WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_FALSE@am__objects_28 = src/elpa2/kernels/real_avx-avx2_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK2_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK2_KERNEL_FALSE@am__objects_29 = src/elpa2/kernels/real_avx-avx2_2hv_single_precision.lo +@WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE@am__objects_30 = src/elpa2/kernels/real_avx512_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK2_KERNEL_TRUE@am__objects_31 = src/elpa2/kernels/real_avx512_2hv_single_precision.lo +@WITH_REAL_SPARC64_BLOCK4_KERNEL_TRUE@am__objects_32 = src/elpa2/kernels/real_sparc64_4hv_double_precision.lo +@WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE@am__objects_33 = src/elpa2/kernels/real_neon_arch64_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL_TRUE@am__objects_34 = src/elpa2/kernels/real_neon_arch64_4hv_single_precision.lo +@WITH_REAL_VSX_BLOCK4_KERNEL_TRUE@am__objects_35 = src/elpa2/kernels/real_vsx_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK4_KERNEL_TRUE@am__objects_36 = src/elpa2/kernels/real_vsx_4hv_single_precision.lo +@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__objects_37 = 
src/elpa2/kernels/real_sse_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__objects_38 = src/elpa2/kernels/real_sse_4hv_single_precision.lo +@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__objects_39 = src/elpa2/kernels/real_avx-avx2_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__objects_40 = src/elpa2/kernels/real_avx-avx2_4hv_single_precision.lo +@WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_FALSE@am__objects_41 = src/elpa2/kernels/real_avx-avx2_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK4_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK4_KERNEL_FALSE@am__objects_42 = src/elpa2/kernels/real_avx-avx2_4hv_single_precision.lo +@WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE@am__objects_43 = src/elpa2/kernels/real_avx512_4hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK4_KERNEL_TRUE@am__objects_44 = src/elpa2/kernels/real_avx512_4hv_single_precision.lo +@WITH_REAL_SPARC64_BLOCK6_KERNEL_TRUE@am__objects_45 = src/elpa2/kernels/real_sparc64_6hv_double_precision.lo +@WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE@am__objects_46 = src/elpa2/kernels/real_neon_arch64_6hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL_TRUE@am__objects_47 = src/elpa2/kernels/real_neon_arch64_6hv_single_precision.lo +@WITH_REAL_VSX_BLOCK6_KERNEL_TRUE@am__objects_48 = src/elpa2/kernels/real_vsx_6hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_VSX_BLOCK6_KERNEL_TRUE@am__objects_49 = src/elpa2/kernels/real_vsx_6hv_single_precision.lo +@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__objects_50 = src/elpa2/kernels/real_sse_6hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__objects_51 = src/elpa2/kernels/real_sse_6hv_single_precision.lo +@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__objects_52 = src/elpa2/kernels/real_avx-avx2_6hv_double_precision.lo 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__objects_53 = src/elpa2/kernels/real_avx-avx2_6hv_single_precision.lo +@WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_FALSE@am__objects_54 = src/elpa2/kernels/real_avx-avx2_6hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX2_BLOCK6_KERNEL_TRUE@@WITH_REAL_AVX_BLOCK6_KERNEL_FALSE@am__objects_55 = src/elpa2/kernels/real_avx-avx2_6hv_single_precision.lo +@WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE@am__objects_56 = src/elpa2/kernels/real_avx512_6hv_double_precision.lo +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_REAL_AVX512_BLOCK6_KERNEL_TRUE@am__objects_57 = src/elpa2/kernels/real_avx512_6hv_single_precision.lo +@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__objects_58 = src/elpa2/kernels/complex_sse_1hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__objects_59 = src/elpa2/kernels/complex_sse_1hv_single_precision.lo +@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__objects_60 = src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__objects_61 = src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.lo +@WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE@am__objects_62 = src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX2_BLOCK1_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK1_KERNEL_FALSE@am__objects_63 = src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.lo +@WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE@am__objects_64 = src/elpa2/kernels/complex_avx512_1hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX512_BLOCK1_KERNEL_TRUE@am__objects_65 = src/elpa2/kernels/complex_avx512_1hv_single_precision.lo +@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__objects_66 = src/elpa2/kernels/complex_sse_2hv_double_precision.lo 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__objects_67 = src/elpa2/kernels/complex_sse_2hv_single_precision.lo +@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__objects_68 = src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__objects_69 = src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.lo +@WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE@am__objects_70 = src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX2_BLOCK2_KERNEL_TRUE@@WITH_COMPLEX_AVX_BLOCK2_KERNEL_FALSE@am__objects_71 = src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.lo +@WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE@am__objects_72 = src/elpa2/kernels/complex_avx512_2hv_double_precision.lo +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_COMPLEX_AVX512_BLOCK2_KERNEL_TRUE@am__objects_73 = src/elpa2/kernels/complex_avx512_2hv_single_precision.lo +@STORE_BUILD_CONFIG_TRUE@am__objects_74 = \ +@STORE_BUILD_CONFIG_TRUE@ src/helpers/print_build_config.lo +am_libelpa@SUFFIX@_private_la_OBJECTS = \ + src/libelpa@SUFFIX@_private_la-elpa_impl.lo \ + src/libelpa@SUFFIX@_private_la-elpa_autotune_impl.lo \ + src/libelpa@SUFFIX@_private_la-elpa_abstract_impl.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_precision.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_blas_interfaces.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_scalapack_interfaces.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_mpi.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_mpi_stubs.lo \ + src/helpers/libelpa@SUFFIX@_private_la-mod_omp.lo \ + src/libelpa@SUFFIX@_private_la-elpa_generated_fortran_interfaces.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-mod_redist_band.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_cpu.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-mod_compute_hh_trafo.lo \ + 
src/helpers/libelpa@SUFFIX@_private_la-aligned_mem.lo \ + src/elpa1/libelpa@SUFFIX@_private_la-elpa1_compute_private.lo \ + src/elpa1/libelpa@SUFFIX@_private_la-elpa1_auxiliary.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-elpa2_determine_workload.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-elpa2_compute.lo \ + src/elpa2/kernels/libelpa@SUFFIX@_private_la-mod_single_hh_trafo_real.lo \ + src/GPU/libelpa@SUFFIX@_private_la-check_for_gpu.lo \ + src/GPU/libelpa@SUFFIX@_private_la-mod_cuda.lo \ + src/elpa2/GPU/libelpa@SUFFIX@_private_la-interface_c_kernel.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_gpu.lo \ + src/elpa2/qr/libelpa@SUFFIX@_private_la-qr_utils.lo \ + src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_qrkernels.lo \ + src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdlarfb.lo \ + src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdgeqrf.lo \ + src/elpa1/libelpa@SUFFIX@_private_la-elpa1.lo \ + src/elpa2/libelpa@SUFFIX@_private_la-elpa2.lo \ + src/elpa_generalized/cannon.lo \ + src/helpers/libelpa@SUFFIX@_private_la-matrix_plot.lo \ + src/general/libelpa@SUFFIX@_private_la-mod_elpa_skewsymmetric_blas.lo \ + src/elpa_index.lo src/elpa_c_interface.lo \ + src/general/libelpa@SUFFIX@_private_la-elpa_utilities.lo \ $(am__objects_1) $(am__objects_2) $(am__objects_3) \ $(am__objects_4) $(am__objects_5) $(am__objects_6) \ $(am__objects_7) $(am__objects_8) $(am__objects_9) \ $(am__objects_10) $(am__objects_11) $(am__objects_12) \ $(am__objects_13) $(am__objects_14) $(am__objects_15) \ $(am__objects_16) $(am__objects_17) $(am__objects_18) \ - $(am__objects_19) $(am__objects_20) $(am__objects_21) -libelpa@SUFFIX@_la_OBJECTS = $(am_libelpa@SUFFIX@_la_OBJECTS) -@WITH_OPENMP_FALSE@am__EXEEXT_1 = \ -@WITH_OPENMP_FALSE@ elpa1_test_real_c_version@SUFFIX@$(EXEEXT) \ -@WITH_OPENMP_FALSE@ elpa1_test_complex_c_version@SUFFIX@$(EXEEXT) \ -@WITH_OPENMP_FALSE@ elpa2_test_real_c_version@SUFFIX@$(EXEEXT) \ -@WITH_OPENMP_FALSE@ elpa2_test_complex_c_version@SUFFIX@$(EXEEXT) -PROGRAMS = 
$(bin_PROGRAMS) $(noinst_PROGRAMS) -am__elpa1_test_complex@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_complex.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am__objects_22 = test/shared_sources/util.$(OBJEXT) \ - test/shared_sources/read_input_parameters.$(OBJEXT) \ - test/shared_sources/check_correctnes.$(OBJEXT) \ - test/shared_sources/setup_mpi.$(OBJEXT) \ - test/shared_sources/blacs_infrastructure.$(OBJEXT) \ - test/shared_sources/prepare_matrix.$(OBJEXT) \ - test/shared_sources/mod_output_types.$(OBJEXT) -@HAVE_REDIRECT_TRUE@am__objects_23 = \ -@HAVE_REDIRECT_TRUE@ test/shared_sources/redir.$(OBJEXT) \ -@HAVE_REDIRECT_TRUE@ test/shared_sources/redirect.$(OBJEXT) -am_elpa1_test_complex@SUFFIX@_OBJECTS = \ - test/fortran_test_programs/test_complex.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa1_test_complex@SUFFIX@_OBJECTS = \ - $(am_elpa1_test_complex@SUFFIX@_OBJECTS) -elpa1_test_complex@SUFFIX@_DEPENDENCIES = $(build_lib) + $(am__objects_19) $(am__objects_20) $(am__objects_21) \ + $(am__objects_22) $(am__objects_23) $(am__objects_24) \ + $(am__objects_25) $(am__objects_26) $(am__objects_27) \ + $(am__objects_28) $(am__objects_29) $(am__objects_30) \ + $(am__objects_31) $(am__objects_32) $(am__objects_33) \ + $(am__objects_34) $(am__objects_35) $(am__objects_36) \ + $(am__objects_37) $(am__objects_38) $(am__objects_39) \ + $(am__objects_40) $(am__objects_41) $(am__objects_42) \ + $(am__objects_43) $(am__objects_44) $(am__objects_45) \ + $(am__objects_46) $(am__objects_47) $(am__objects_48) \ + $(am__objects_49) $(am__objects_50) $(am__objects_51) \ + $(am__objects_52) $(am__objects_53) $(am__objects_54) \ + 
$(am__objects_55) $(am__objects_56) $(am__objects_57) \ + $(am__objects_58) $(am__objects_59) $(am__objects_60) \ + $(am__objects_61) $(am__objects_62) $(am__objects_63) \ + $(am__objects_64) $(am__objects_65) $(am__objects_66) \ + $(am__objects_67) $(am__objects_68) $(am__objects_69) \ + $(am__objects_70) $(am__objects_71) $(am__objects_72) \ + $(am__objects_73) $(am__objects_74) +libelpa@SUFFIX@_private_la_OBJECTS = \ + $(am_libelpa@SUFFIX@_private_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) am__v_lt_0 = --silent am__v_lt_1 = -am__elpa1_test_complex_c_version@SUFFIX@_SOURCES_DIST = \ - test/c_test_programs/elpa1_test_complex_c_version.c \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -@WITH_OPENMP_FALSE@am_elpa1_test_complex_c_version@SUFFIX@_OBJECTS = test/c_test_programs/elpa1_test_complex_c_version.$(OBJEXT) \ -@WITH_OPENMP_FALSE@ $(am__objects_22) $(am__objects_23) -elpa1_test_complex_c_version@SUFFIX@_OBJECTS = \ - $(am_elpa1_test_complex_c_version@SUFFIX@_OBJECTS) -@WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = \ -@WITH_OPENMP_FALSE@ $(build_lib) -am__elpa1_test_real@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_real.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa1_test_real@SUFFIX@_OBJECTS = \ - test/fortran_test_programs/test_real.$(OBJEXT) \ 
- $(am__objects_22) $(am__objects_23) -elpa1_test_real@SUFFIX@_OBJECTS = \ - $(am_elpa1_test_real@SUFFIX@_OBJECTS) -elpa1_test_real@SUFFIX@_DEPENDENCIES = $(build_lib) -am__elpa1_test_real_c_version@SUFFIX@_SOURCES_DIST = \ - test/c_test_programs/elpa1_test_real_c_version.c \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -@WITH_OPENMP_FALSE@am_elpa1_test_real_c_version@SUFFIX@_OBJECTS = test/c_test_programs/elpa1_test_real_c_version.$(OBJEXT) \ -@WITH_OPENMP_FALSE@ $(am__objects_22) $(am__objects_23) -elpa1_test_real_c_version@SUFFIX@_OBJECTS = \ - $(am_elpa1_test_real_c_version@SUFFIX@_OBJECTS) -@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = \ -@WITH_OPENMP_FALSE@ $(build_lib) -am__elpa1_test_real_with_c@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_real_with_c.F90 \ - test/shared_sources/mod_from_c.F90 \ - test/shared_sources/call_elpa1.c test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa1_test_real_with_c@SUFFIX@_OBJECTS = \ - test/fortran_test_programs/test_real_with_c.$(OBJEXT) \ - test/shared_sources/mod_from_c.$(OBJEXT) \ - test/shared_sources/call_elpa1.$(OBJEXT) $(am__objects_22) \ - $(am__objects_23) -elpa1_test_real_with_c@SUFFIX@_OBJECTS = \ - $(am_elpa1_test_real_with_c@SUFFIX@_OBJECTS) -elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = $(build_lib) 
-am__elpa2_print_kernels@SUFFIX@_SOURCES_DIST = \ - src/elpa2_print_kernels.F90 test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_print_kernels@SUFFIX@_OBJECTS = \ - src/elpa2_print_kernels.$(OBJEXT) $(am__objects_22) \ - $(am__objects_23) +libelpa@SUFFIX@_private_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libelpa@SUFFIX@_public_la_LIBADD = +am_libelpa@SUFFIX@_public_la_OBJECTS = \ + src/libelpa@SUFFIX@_public_la-elpa.lo \ + src/libelpa@SUFFIX@_public_la-elpa_api.lo \ + src/libelpa@SUFFIX@_public_la-elpa_constants.lo +libelpa@SUFFIX@_public_la_OBJECTS = \ + $(am_libelpa@SUFFIX@_public_la_OBJECTS) +libelpa@SUFFIX@_public_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(libelpa@SUFFIX@_public_la_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libelpatest@SUFFIX@_la_LIBADD = +am__libelpatest@SUFFIX@_la_SOURCES_DIST = \ + test/shared/tests_variable_definitions.F90 \ + test/shared/mod_tests_scalapack_interfaces.F90 \ + test/shared/mod_tests_blas_interfaces.F90 \ + test/shared/test_util.F90 \ + test/shared/test_read_input_parameters.F90 \ + test/shared/test_check_correctness.F90 \ + test/shared/test_setup_mpi.F90 \ + test/shared/test_blacs_infrastructure.F90 \ + test/shared/test_prepare_matrix.F90 \ + test/shared/test_analytic.F90 test/shared/test_output_type.F90 \ + test/shared/test_scalapack.F90 test/shared/test_redir.c \ + test/shared/test_redirect.F90 +@WITH_SCALAPACK_TESTS_TRUE@am__objects_75 = 
test/shared/libelpatest@SUFFIX@_la-test_scalapack.lo +@HAVE_REDIRECT_TRUE@am__objects_76 = test/shared/test_redir.lo \ +@HAVE_REDIRECT_TRUE@ test/shared/libelpatest@SUFFIX@_la-test_redirect.lo +am_libelpatest@SUFFIX@_la_OBJECTS = test/shared/libelpatest@SUFFIX@_la-tests_variable_definitions.lo \ + test/shared/libelpatest@SUFFIX@_la-mod_tests_scalapack_interfaces.lo \ + test/shared/libelpatest@SUFFIX@_la-mod_tests_blas_interfaces.lo \ + test/shared/libelpatest@SUFFIX@_la-test_util.lo \ + test/shared/libelpatest@SUFFIX@_la-test_read_input_parameters.lo \ + test/shared/libelpatest@SUFFIX@_la-test_check_correctness.lo \ + test/shared/libelpatest@SUFFIX@_la-test_setup_mpi.lo \ + test/shared/libelpatest@SUFFIX@_la-test_blacs_infrastructure.lo \ + test/shared/libelpatest@SUFFIX@_la-test_prepare_matrix.lo \ + test/shared/libelpatest@SUFFIX@_la-test_analytic.lo \ + test/shared/libelpatest@SUFFIX@_la-test_output_type.lo \ + $(am__objects_75) $(am__objects_76) +libelpatest@SUFFIX@_la_OBJECTS = $(am_libelpatest@SUFFIX@_la_OBJECTS) +libelpatest@SUFFIX@_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +wrapper_la_DEPENDENCIES = libelpa@SUFFIX@.la +nodist_wrapper_la_OBJECTS = python/pyelpa/wrapper_la-wrapper.lo +wrapper_la_OBJECTS = $(nodist_wrapper_la_OBJECTS) +wrapper_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(wrapper_la_CFLAGS) \ + $(CFLAGS) $(wrapper_la_LDFLAGS) $(LDFLAGS) -o $@ +@WITH_PYTHON_TRUE@am_wrapper_la_rpath = -rpath $(pyelpadir) +am_elpa2_print_kernels@SUFFIX@_OBJECTS = src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.$(OBJEXT) elpa2_print_kernels@SUFFIX@_OBJECTS = \ $(am_elpa2_print_kernels@SUFFIX@_OBJECTS) -elpa2_print_kernels@SUFFIX@_DEPENDENCIES = $(build_lib) -am__elpa2_test_complex@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_complex2.F90 \ - 
test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_complex@SUFFIX@_OBJECTS = \ - test/fortran_test_programs/test_complex2.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_complex@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_complex@SUFFIX@_OBJECTS) -elpa2_test_complex@SUFFIX@_DEPENDENCIES = $(build_lib) -am__elpa2_test_complex_c_version@SUFFIX@_SOURCES_DIST = \ - test/c_test_programs/elpa2_test_complex_c_version.c \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -@WITH_OPENMP_FALSE@am_elpa2_test_complex_c_version@SUFFIX@_OBJECTS = test/c_test_programs/elpa2_test_complex_c_version.$(OBJEXT) \ -@WITH_OPENMP_FALSE@ $(am__objects_22) $(am__objects_23) -elpa2_test_complex_c_version@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_complex_c_version@SUFFIX@_OBJECTS) -@WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = \ -@WITH_OPENMP_FALSE@ $(build_lib) -am__elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES_DIST = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c 
test/shared_sources/redirect.F90 -am_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS = test/fortran_test_programs/test_complex2_choose_kernel_with_api.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS = $(am_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS) -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = \ - $(build_lib) -am__elpa2_test_complex_default_kernel@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_complex2_default_kernel.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS = test/fortran_test_programs/test_complex2_default_kernel.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS) -elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = $(build_lib) -am__elpa2_test_real@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_real2.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_real@SUFFIX@_OBJECTS = \ - test/fortran_test_programs/test_real2.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_real@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_real@SUFFIX@_OBJECTS) -elpa2_test_real@SUFFIX@_DEPENDENCIES = $(build_lib) 
-am__elpa2_test_real_c_version@SUFFIX@_SOURCES_DIST = \ - test/c_test_programs/elpa2_test_real_c_version.c \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -@WITH_OPENMP_FALSE@am_elpa2_test_real_c_version@SUFFIX@_OBJECTS = test/c_test_programs/elpa2_test_real_c_version.$(OBJEXT) \ -@WITH_OPENMP_FALSE@ $(am__objects_22) $(am__objects_23) -elpa2_test_real_c_version@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_real_c_version@SUFFIX@_OBJECTS) -@WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = \ -@WITH_OPENMP_FALSE@ $(build_lib) -am__elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES_DIST = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS = test/fortran_test_programs/test_real2_choose_kernel_with_api.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS) -elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = \ - $(build_lib) -am__elpa2_test_real_default_kernel@SUFFIX@_SOURCES_DIST = \ - test/fortran_test_programs/test_real2_default_kernel.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - 
test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_real_default_kernel@SUFFIX@_OBJECTS = test/fortran_test_programs/test_real2_default_kernel.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_real_default_kernel@SUFFIX@_OBJECTS = \ - $(am_elpa2_test_real_default_kernel@SUFFIX@_OBJECTS) -elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = $(build_lib) -am__elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES_DIST = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - test/shared_sources/util.F90 \ - test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 \ - test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 \ - test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 \ - test/shared_sources/redir.c test/shared_sources/redirect.F90 -am_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.$(OBJEXT) \ - $(am__objects_22) $(am__objects_23) -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS = $(am_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS) -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = \ - $(build_lib) +elpa2_print_kernels@SUFFIX@_DEPENDENCIES = libelpa@SUFFIX@.la +elpa2_print_kernels@SUFFIX@_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(elpa2_print_kernels@SUFFIX@_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_test_skewsymmetric_real_double_OBJECTS = test/Fortran/skewsymmetric_real_double-test_skewsymmetric.$(OBJEXT) +test_skewsymmetric_real_double_OBJECTS = \ + $(am_test_skewsymmetric_real_double_OBJECTS) 
+test_skewsymmetric_real_double_DEPENDENCIES = $(test_program_ldadd) +test_skewsymmetric_real_double_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(test_skewsymmetric_real_double_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__test_skewsymmetric_real_single_SOURCES_DIST = \ + test/Fortran/test_skewsymmetric.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_test_skewsymmetric_real_single_OBJECTS = test/Fortran/skewsymmetric_real_single-test_skewsymmetric.$(OBJEXT) +test_skewsymmetric_real_single_OBJECTS = \ + $(am_test_skewsymmetric_real_single_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@test_skewsymmetric_real_single_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +test_skewsymmetric_real_single_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(test_skewsymmetric_real_single_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_c_version_complex_double_SOURCES_DIST = \ + test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am_validate_autotune_c_version_complex_double_OBJECTS = test/C/validate_autotune_c_version_complex_double-test_autotune.$(OBJEXT) +validate_autotune_c_version_complex_double_OBJECTS = \ + $(am_validate_autotune_c_version_complex_double_OBJECTS) +am__DEPENDENCIES_1 = +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_complex_double_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_autotune_c_version_complex_double_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) \ + $(validate_autotune_c_version_complex_double_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_c_version_complex_single_SOURCES_DIST = \ + test/C/test_autotune.c 
+@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_autotune_c_version_complex_single_OBJECTS = test/C/validate_autotune_c_version_complex_single-test_autotune.$(OBJEXT) +validate_autotune_c_version_complex_single_OBJECTS = \ + $(am_validate_autotune_c_version_complex_single_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_c_version_complex_single_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(am__DEPENDENCIES_1) +validate_autotune_c_version_complex_single_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) \ + $(validate_autotune_c_version_complex_single_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_c_version_real_double_SOURCES_DIST = \ + test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am_validate_autotune_c_version_real_double_OBJECTS = test/C/validate_autotune_c_version_real_double-test_autotune.$(OBJEXT) +validate_autotune_c_version_real_double_OBJECTS = \ + $(am_validate_autotune_c_version_real_double_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_real_double_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_autotune_c_version_real_double_LINK = $(LIBTOOL) $(AM_V_lt) \ + --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(CCLD) $(validate_autotune_c_version_real_double_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_c_version_real_single_SOURCES_DIST = \ + test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_autotune_c_version_real_single_OBJECTS = test/C/validate_autotune_c_version_real_single-test_autotune.$(OBJEXT) +validate_autotune_c_version_real_single_OBJECTS = \ + 
$(am_validate_autotune_c_version_real_single_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_c_version_real_single_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ $(am__DEPENDENCIES_1) +validate_autotune_c_version_real_single_LINK = $(LIBTOOL) $(AM_V_lt) \ + --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(CCLD) $(validate_autotune_c_version_real_single_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_complex_double_SOURCES_DIST = \ + test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@am_validate_autotune_complex_double_OBJECTS = test/Fortran/validate_autotune_complex_double-test_autotune.$(OBJEXT) +validate_autotune_complex_double_OBJECTS = \ + $(am_validate_autotune_complex_double_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@validate_autotune_complex_double_DEPENDENCIES = \ +@ENABLE_AUTOTUNING_TRUE@ $(test_program_ldadd) +validate_autotune_complex_double_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_autotune_complex_double_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_complex_single_SOURCES_DIST = \ + test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_autotune_complex_single_OBJECTS = test/Fortran/validate_autotune_complex_single-test_autotune.$(OBJEXT) +validate_autotune_complex_single_OBJECTS = \ + $(am_validate_autotune_complex_single_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_complex_single_DEPENDENCIES = $(test_program_ldadd) +validate_autotune_complex_single_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_autotune_complex_single_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_real_double_SOURCES_DIST = \ + 
test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@am_validate_autotune_real_double_OBJECTS = test/Fortran/validate_autotune_real_double-test_autotune.$(OBJEXT) +validate_autotune_real_double_OBJECTS = \ + $(am_validate_autotune_real_double_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@validate_autotune_real_double_DEPENDENCIES = \ +@ENABLE_AUTOTUNING_TRUE@ $(test_program_ldadd) +validate_autotune_real_double_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_autotune_real_double_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_autotune_real_single_SOURCES_DIST = \ + test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_autotune_real_single_OBJECTS = test/Fortran/validate_autotune_real_single-test_autotune.$(OBJEXT) +validate_autotune_real_single_OBJECTS = \ + $(am_validate_autotune_real_single_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_real_single_DEPENDENCIES = $(test_program_ldadd) +validate_autotune_real_single_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_autotune_real_single_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_double_eigenvectors_1stage_gpu_random_OBJECTS = test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_double_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) 
+validate_c_version_complex_double_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_eigenvectors_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_complex_double_eigenvectors_1stage_random_OBJECTS = test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.$(OBJEXT) +validate_c_version_complex_double_eigenvectors_1stage_random_OBJECTS = $(am_validate_c_version_complex_double_eigenvectors_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_double_eigenvectors_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) 
+validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS = test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_double_generalized_1stage_gpu_random_OBJECTS = test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_double_generalized_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_double_generalized_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ 
$(am__DEPENDENCIES_1) +validate_c_version_complex_double_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_generalized_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_complex_double_generalized_1stage_random_OBJECTS = test/C/validate_c_version_complex_double_generalized_1stage_random-test.$(OBJEXT) +validate_c_version_complex_double_generalized_1stage_random_OBJECTS = $(am_validate_c_version_complex_double_generalized_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_double_generalized_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_generalized_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS = test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) 
+validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_double_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_complex_double_generalized_decomp_1stage_random_OBJECTS = test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_c_version_complex_double_generalized_decomp_1stage_random_OBJECTS = $(am_validate_c_version_complex_double_generalized_decomp_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_double_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_single_eigenvectors_1stage_gpu_random_OBJECTS = test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_eigenvectors_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_c_version_complex_single_eigenvectors_1stage_random_OBJECTS = test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.$(OBJEXT) +validate_c_version_complex_single_eigenvectors_1stage_random_OBJECTS = $(am_validate_c_version_complex_single_eigenvectors_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_eigenvectors_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS = test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_single_generalized_1stage_gpu_random_OBJECTS = 
test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_single_generalized_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_single_generalized_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_generalized_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_c_version_complex_single_generalized_1stage_random_OBJECTS = test/C/validate_c_version_complex_single_generalized_1stage_random-test.$(OBJEXT) +validate_c_version_complex_single_generalized_1stage_random_OBJECTS = $(am_validate_c_version_complex_single_generalized_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_generalized_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_generalized_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS = test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_complex_single_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_c_version_complex_single_generalized_decomp_1stage_random_OBJECTS = test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_c_version_complex_single_generalized_decomp_1stage_random_OBJECTS = $(am_validate_c_version_complex_single_generalized_decomp_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_complex_single_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + 
$(validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_double_eigenvectors_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_double_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_double_eigenvectors_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_eigenvectors_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_real_double_eigenvectors_1stage_random_OBJECTS = test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.$(OBJEXT) +validate_c_version_real_double_eigenvectors_1stage_random_OBJECTS = $(am_validate_c_version_real_double_eigenvectors_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_eigenvectors_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS = test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + 
$(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_double_generalized_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_double_generalized_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_double_generalized_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_generalized_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_real_double_generalized_1stage_random_OBJECTS = test/C/validate_c_version_real_double_generalized_1stage_random-test.$(OBJEXT) +validate_c_version_real_double_generalized_1stage_random_OBJECTS = $(am_validate_c_version_real_double_generalized_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_generalized_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_generalized_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_c_version_real_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_double_generalized_decomp_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_double_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@am_validate_c_version_real_double_generalized_decomp_1stage_random_OBJECTS = test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_c_version_real_double_generalized_decomp_1stage_random_OBJECTS = $(am_validate_c_version_real_double_generalized_decomp_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_decomp_1stage_random_DEPENDENCIES = \ +@ENABLE_C_TESTS_TRUE@ $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_double_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_c_version_real_single_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_single_eigenvectors_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_single_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_single_eigenvectors_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_eigenvectors_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_c_version_real_single_eigenvectors_1stage_random_OBJECTS = test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.$(OBJEXT) +validate_c_version_real_single_eigenvectors_1stage_random_OBJECTS = $(am_validate_c_version_real_single_eigenvectors_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_eigenvectors_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o 
$@ +am__validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS = test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ $(am__DEPENDENCIES_1) 
+validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_single_generalized_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_single_generalized_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_single_generalized_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_generalized_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_c_version_real_single_generalized_1stage_random_OBJECTS = test/C/validate_c_version_real_single_generalized_1stage_random-test.$(OBJEXT) +validate_c_version_real_single_generalized_1stage_random_OBJECTS = $(am_validate_c_version_real_single_generalized_1stage_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ 
$(am__DEPENDENCIES_1) +validate_c_version_real_single_generalized_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_generalized_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_c_version_real_single_generalized_decomp_1stage_gpu_random_OBJECTS = test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_c_version_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_c_version_real_single_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_c_version_real_single_generalized_decomp_1stage_random_OBJECTS = test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_c_version_real_single_generalized_decomp_1stage_random_OBJECTS = $(am_validate_c_version_real_single_generalized_decomp_1stage_random_OBJECTS) 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_decomp_1stage_random_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ $(am__DEPENDENCIES_1) +validate_c_version_real_single_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_2stage_banded@SUFFIX@_OBJECTS = test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.$(OBJEXT) +validate_complex_2stage_banded@SUFFIX@_OBJECTS = \ + $(am_validate_complex_2stage_banded@SUFFIX@_OBJECTS) +validate_complex_2stage_banded@SUFFIX@_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_2stage_banded@SUFFIX@_LINK = $(LIBTOOL) $(AM_V_lt) \ + --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(FCLD) $(validate_complex_2stage_banded@SUFFIX@_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_cholesky_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_gpu_random_OBJECTS = $(am_validate_complex_double_cholesky_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_cholesky_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_cholesky_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) 
+validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_cholesky_1stage_random_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_random-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_random_OBJECTS = \ + $(am_validate_complex_double_cholesky_1stage_random_OBJECTS) +validate_complex_double_cholesky_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_cholesky_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_cholesky_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_double_cholesky_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_double_cholesky_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_cholesky_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_toeplitz_OBJECTS = $(am_validate_complex_double_cholesky_1stage_toeplitz_OBJECTS) +validate_complex_double_cholesky_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_cholesky_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) 
+@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvalues_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvalues_1stage_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvalues_1stage_toeplitz_OBJECTS) +validate_complex_double_eigenvalues_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) 
$(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = 
test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_1stage_analytic_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_analytic_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_analytic_OBJECTS) +validate_complex_double_eigenvectors_1stage_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) 
+validate_complex_double_eigenvectors_1stage_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am_validate_complex_double_eigenvectors_1stage_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_random_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_random_OBJECTS) +validate_complex_double_eigenvectors_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_toeplitz_OBJECTS) +validate_complex_double_eigenvectors_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + 
$(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = 
test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = 
test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_all_kernels_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_random_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_random_OBJECTS) +validate_complex_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT) 
+validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES = \ 
+@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + 
$(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) 
+validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) +validate_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_scalapack_all_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_SCALAPACK_TESTS_TRUE@am_validate_complex_double_eigenvectors_scalapack_all_analytic_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.$(OBJEXT) +validate_complex_double_eigenvectors_scalapack_all_analytic_OBJECTS = $(am_validate_complex_double_eigenvectors_scalapack_all_analytic_OBJECTS) +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES = \ +@WITH_SCALAPACK_TESTS_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_all_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_scalapack_all_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am_validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_scalapack_part_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@am_validate_complex_double_eigenvectors_scalapack_part_analytic_OBJECTS = 
test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.$(OBJEXT) +validate_complex_double_eigenvectors_scalapack_part_analytic_OBJECTS = $(am_validate_complex_double_eigenvectors_scalapack_part_analytic_OBJECTS) +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES = \ +@WITH_SCALAPACK_TESTS_TRUE@ $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_part_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_scalapack_part_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am_validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.$(OBJEXT) +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS = $(am_validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_generalized_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_complex_double_generalized_1stage_gpu_random_OBJECTS = 
$(am_validate_complex_double_generalized_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_generalized_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_generalized_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_generalized_1stage_random_OBJECTS = test/Fortran/validate_complex_double_generalized_1stage_random-test.$(OBJEXT) +validate_complex_double_generalized_1stage_random_OBJECTS = $(am_validate_complex_double_generalized_1stage_random_OBJECTS) +validate_complex_double_generalized_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_generalized_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + 
$(validate_complex_double_generalized_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_generalized_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_double_generalized_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_double_generalized_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_decomp_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_generalized_decomp_1stage_random_OBJECTS = test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_complex_double_generalized_decomp_1stage_random_OBJECTS = $(am_validate_complex_double_generalized_decomp_1stage_random_OBJECTS) +validate_complex_double_generalized_decomp_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_decomp_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_generalized_decomp_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_double_generalized_decomp_1stage_random_all_layouts_OBJECTS = 
$(am_validate_complex_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_hermitian_multiply_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_complex_double_hermitian_multiply_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT) +validate_complex_double_hermitian_multiply_1stage_gpu_random_OBJECTS = $(am_validate_complex_double_hermitian_multiply_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_complex_double_hermitian_multiply_1stage_random_OBJECTS = test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.$(OBJEXT) +validate_complex_double_hermitian_multiply_1stage_random_OBJECTS = $(am_validate_complex_double_hermitian_multiply_1stage_random_OBJECTS) +validate_complex_double_hermitian_multiply_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_double_hermitian_multiply_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_double_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_complex_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_cholesky_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_gpu_random_OBJECTS = $(am_validate_complex_single_cholesky_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_complex_single_cholesky_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_cholesky_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_cholesky_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_cholesky_1stage_random_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_random-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_random_OBJECTS = \ + $(am_validate_complex_single_cholesky_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_cholesky_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_cholesky_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_single_cholesky_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_cholesky_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_toeplitz_OBJECTS = $(am_validate_complex_single_cholesky_1stage_toeplitz_OBJECTS) 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_cholesky_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_LINK = \ 
+ $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvalues_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvalues_1stage_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvalues_1stage_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + 
$(validate_complex_single_eigenvalues_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + 
$(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link 
$(FCLD) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_1stage_analytic_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_analytic_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_analytic_FCFLAGS) \ + 
$(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_1stage_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_random_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_1stage_random_all_layouts_OBJECTS = 
test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_1stage_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_1stage_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = 
$(am_validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = 
$(am_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT) 
+validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = 
test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_random_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST = \ + 
test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_random_LINK = \ + 
$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) 
+validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_generalized_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_complex_single_generalized_1stage_gpu_random_OBJECTS = $(am_validate_complex_single_generalized_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) 
+validate_complex_single_generalized_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_generalized_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_generalized_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_generalized_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_generalized_1stage_random_OBJECTS = test/Fortran/validate_complex_single_generalized_1stage_random-test.$(OBJEXT) +validate_complex_single_generalized_1stage_random_OBJECTS = $(am_validate_complex_single_generalized_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_generalized_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + 
$(validate_complex_single_generalized_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_generalized_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_single_generalized_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_single_generalized_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_generalized_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_decomp_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_generalized_decomp_1stage_random_OBJECTS = test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_complex_single_generalized_decomp_1stage_random_OBJECTS = $(am_validate_complex_single_generalized_decomp_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_decomp_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_decomp_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_complex_single_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_generalized_decomp_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_single_generalized_decomp_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_hermitian_multiply_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_complex_single_hermitian_multiply_1stage_gpu_random_OBJECTS = test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT) +validate_complex_single_hermitian_multiply_1stage_gpu_random_OBJECTS = $(am_validate_complex_single_hermitian_multiply_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + 
test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_hermitian_multiply_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_complex_single_hermitian_multiply_1stage_random_OBJECTS = test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.$(OBJEXT) +validate_complex_single_hermitian_multiply_1stage_random_OBJECTS = $(am_validate_complex_single_hermitian_multiply_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_hermitian_multiply_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_hermitian_multiply_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_complex_single_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@am_validate_complex_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT) +validate_complex_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS = $(am_validate_complex_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_double_instance@SUFFIX@_OBJECTS = test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.$(OBJEXT) +validate_double_instance@SUFFIX@_OBJECTS = \ + $(am_validate_double_instance@SUFFIX@_OBJECTS) +validate_double_instance@SUFFIX@_DEPENDENCIES = $(test_program_ldadd) +validate_double_instance@SUFFIX@_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_double_instance@SUFFIX@_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_multiple_objs_real_double_SOURCES_DIST = \ + test/Fortran/test_multiple_objs.F90 +@ENABLE_AUTOTUNING_TRUE@am_validate_multiple_objs_real_double_OBJECTS = test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.$(OBJEXT) +validate_multiple_objs_real_double_OBJECTS = \ + $(am_validate_multiple_objs_real_double_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@validate_multiple_objs_real_double_DEPENDENCIES = \ +@ENABLE_AUTOTUNING_TRUE@ $(test_program_ldadd) +validate_multiple_objs_real_double_LINK = $(LIBTOOL) $(AM_V_lt) \ + --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(FCLD) 
$(validate_multiple_objs_real_double_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_multiple_objs_real_double_c_version_SOURCES_DIST = \ + test/C/test_multiple_objs.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@am_validate_multiple_objs_real_double_c_version_OBJECTS = test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.$(OBJEXT) +validate_multiple_objs_real_double_c_version_OBJECTS = \ + $(am_validate_multiple_objs_real_double_c_version_OBJECTS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_multiple_objs_real_double_c_version_DEPENDENCIES = $(test_program_ldadd) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ $(am__DEPENDENCIES_1) +validate_multiple_objs_real_double_c_version_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) \ + $(validate_multiple_objs_real_double_c_version_CFLAGS) \ + $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_2stage_banded@SUFFIX@_OBJECTS = test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.$(OBJEXT) +validate_real_2stage_banded@SUFFIX@_OBJECTS = \ + $(am_validate_real_2stage_banded@SUFFIX@_OBJECTS) +validate_real_2stage_banded@SUFFIX@_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_2stage_banded@SUFFIX@_LINK = $(LIBTOOL) $(AM_V_lt) \ + --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \ + $(FCLD) $(validate_real_2stage_banded@SUFFIX@_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_cholesky_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.$(OBJEXT) +validate_real_double_cholesky_1stage_gpu_random_OBJECTS = \ + $(am_validate_real_double_cholesky_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ 
$(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_OBJECTS = $(am_validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + 
$(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_cholesky_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_cholesky_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_double_cholesky_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_cholesky_1stage_random_OBJECTS = 
test/Fortran/validate_real_double_cholesky_1stage_random-test.$(OBJEXT) +validate_real_double_cholesky_1stage_random_OBJECTS = \ + $(am_validate_real_double_cholesky_1stage_random_OBJECTS) +validate_real_double_cholesky_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_double_cholesky_1stage_random_all_layouts_OBJECTS = $(am_validate_real_double_cholesky_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_cholesky_1stage_random_split_comm_myself_OBJECTS = $(am_validate_real_double_cholesky_1stage_random_split_comm_myself_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_split_comm_myself_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_split_comm_myself_LINK = \ + $(LIBTOOL) 
$(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_cholesky_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.$(OBJEXT) +validate_real_double_cholesky_1stage_toeplitz_OBJECTS = \ + $(am_validate_real_double_cholesky_1stage_toeplitz_OBJECTS) +validate_real_double_cholesky_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_cholesky_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_cholesky_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvalues_1stage_frank_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_frank_OBJECTS = \ + $(am_validate_real_double_eigenvalues_1stage_frank_OBJECTS) +validate_real_double_eigenvalues_1stage_frank_DEPENDENCIES = \ + $(test_program_ldadd) 
+validate_real_double_eigenvalues_1stage_frank_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_1stage_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvalues_1stage_gpu_frank_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_gpu_frank_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_frank_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) 
+validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvalues_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_toeplitz_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_toeplitz_OBJECTS) +validate_real_double_eigenvalues_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvalues_2stage_default_kernel_frank_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_frank_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_frank_OBJECTS) +validate_real_double_eigenvalues_2stage_default_kernel_frank_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_OBJECTS = 
test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = 
test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT) 
+validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_1stage_analytic_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_analytic_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_analytic_OBJECTS) +validate_real_double_eigenvectors_1stage_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_analytic_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) 
$(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_1stage_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_frank_OBJECTS = \ + $(am_validate_real_double_eigenvectors_1stage_frank_OBJECTS) +validate_real_double_eigenvectors_1stage_frank_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_frank_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_frank_all_layouts_OBJECTS = 
$(am_validate_real_double_eigenvectors_1stage_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_frank_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_frank_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_LINK = \ 
+ $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_1stage_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_random-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_random_OBJECTS = \ + $(am_validate_real_double_eigenvectors_1stage_random_OBJECTS) +validate_real_double_eigenvectors_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ 
$(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_random_split_comm_myself_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_random_split_comm_myself_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_split_comm_myself_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_split_comm_myself_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_toeplitz_OBJECTS) +validate_real_double_eigenvectors_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_all_kernels_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_frank_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_frank_OBJECTS) +validate_real_double_eigenvectors_2stage_all_kernels_frank_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_all_kernels_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT) 
+validate_real_double_eigenvectors_2stage_all_kernels_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_random_OBJECTS) +validate_real_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC 
$(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_default_kernel_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_frank_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_frank_OBJECTS) +validate_real_double_eigenvectors_2stage_default_kernel_frank_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_OBJECTS = 
$(am_validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS = 
$(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = 
$(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = 
test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.$(OBJEXT) 
+validate_real_double_eigenvectors_2stage_default_kernel_qr_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) +validate_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ + $(test_program_ldadd) 
+validate_real_double_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) 
+validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) 
$(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_scalapack_all_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@am_validate_real_double_eigenvectors_scalapack_all_analytic_OBJECTS = test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.$(OBJEXT) +validate_real_double_eigenvectors_scalapack_all_analytic_OBJECTS = $(am_validate_real_double_eigenvectors_scalapack_all_analytic_OBJECTS) +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES = \ +@WITH_SCALAPACK_TESTS_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_all_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_scalapack_all_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am_validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_scalapack_part_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_SCALAPACK_TESTS_TRUE@am_validate_real_double_eigenvectors_scalapack_part_analytic_OBJECTS = test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.$(OBJEXT) +validate_real_double_eigenvectors_scalapack_part_analytic_OBJECTS = $(am_validate_real_double_eigenvectors_scalapack_part_analytic_OBJECTS) +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES = \ +@WITH_SCALAPACK_TESTS_TRUE@ $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_part_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_scalapack_part_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@am_validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.$(OBJEXT) +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS = $(am_validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_generalized_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.$(OBJEXT) 
+validate_real_double_generalized_1stage_gpu_random_OBJECTS = $(am_validate_real_double_generalized_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_double_generalized_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_generalized_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_generalized_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_generalized_1stage_random_OBJECTS = test/Fortran/validate_real_double_generalized_1stage_random-test.$(OBJEXT) +validate_real_double_generalized_1stage_random_OBJECTS = \ + $(am_validate_real_double_generalized_1stage_random_OBJECTS) +validate_real_double_generalized_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_generalized_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + 
$(validate_real_double_generalized_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_generalized_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_double_generalized_1stage_random_all_layouts_OBJECTS = $(am_validate_real_double_generalized_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_generalized_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_generalized_decomp_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_real_double_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_decomp_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_generalized_decomp_1stage_random_OBJECTS = test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_real_double_generalized_decomp_1stage_random_OBJECTS = $(am_validate_real_double_generalized_decomp_1stage_random_OBJECTS) +validate_real_double_generalized_decomp_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_decomp_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_generalized_decomp_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_double_generalized_decomp_1stage_random_all_layouts_OBJECTS = $(am_validate_real_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) 
+@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_hermitian_multiply_1stage_frank_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_frank_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_frank_OBJECTS) +validate_real_double_hermitian_multiply_1stage_frank_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_hermitian_multiply_1stage_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_hermitian_multiply_1stage_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_frank_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_frank_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_real_double_hermitian_multiply_1stage_gpu_frank_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_hermitian_multiply_1stage_gpu_frank_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_gpu_frank_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_gpu_frank_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_frank_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_frank_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_hermitian_multiply_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@am_validate_real_double_hermitian_multiply_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_gpu_random_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_gpu_random_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_hermitian_multiply_1stage_random_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_random_OBJECTS = 
$(am_validate_real_double_hermitian_multiply_1stage_random_OBJECTS) +validate_real_double_hermitian_multiply_1stage_random_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_MPI_TRUE@am_validate_real_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS = $(am_validate_real_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@am_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) +@WITH_GPU_VERSION_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@ $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) 
--tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_real_double_solve_tridiagonal_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.$(OBJEXT) +validate_real_double_solve_tridiagonal_1stage_toeplitz_OBJECTS = $(am_validate_real_double_solve_tridiagonal_1stage_toeplitz_OBJECTS) +validate_real_double_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES = \ + $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_solve_tridiagonal_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WITH_MPI_TRUE@am_validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) +@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES = \ +@WITH_MPI_TRUE@ $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_cholesky_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.$(OBJEXT) +validate_real_single_cholesky_1stage_gpu_random_OBJECTS = \ + $(am_validate_real_single_cholesky_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT) 
+validate_real_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_cholesky_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_cholesky_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_single_cholesky_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_cholesky_1stage_random_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_random-test.$(OBJEXT) +validate_real_single_cholesky_1stage_random_OBJECTS = \ + $(am_validate_real_single_cholesky_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_cholesky_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_cholesky_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_single_cholesky_1stage_random_all_layouts_OBJECTS = $(am_validate_real_single_cholesky_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_single_cholesky_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_cholesky_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.$(OBJEXT) +validate_real_single_cholesky_1stage_toeplitz_OBJECTS = \ + $(am_validate_real_single_cholesky_1stage_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_cholesky_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_cholesky_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS = 
test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvalues_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvalues_1stage_toeplitz_OBJECTS = $(am_validate_real_single_eigenvalues_1stage_toeplitz_OBJECTS) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES = \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_1stage_analytic_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_analytic_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_analytic_LINK = $(LIBTOOL) \ + 
$(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_gpu_random_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 
+am__validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_1stage_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_random-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_random_OBJECTS = \ + $(am_validate_real_single_eigenvectors_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT) 
+validate_real_single_eigenvectors_1stage_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_toeplitz_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES = 
$(test_program_ldadd) +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK = \ + 
$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK = \ + 
$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_qr_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) 
--tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_analytic_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC 
$(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_qr_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_generalized_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.$(OBJEXT) +validate_real_single_generalized_1stage_gpu_random_OBJECTS = $(am_validate_real_single_generalized_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_1stage_gpu_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_generalized_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_generalized_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_generalized_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + 
$(validate_real_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_generalized_1stage_random_OBJECTS = test/Fortran/validate_real_single_generalized_1stage_random-test.$(OBJEXT) +validate_real_single_generalized_1stage_random_OBJECTS = \ + $(am_validate_real_single_generalized_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_generalized_1stage_random_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_real_single_generalized_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_generalized_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_single_generalized_1stage_random_all_layouts_OBJECTS = $(am_validate_real_single_generalized_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_generalized_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_generalized_decomp_1stage_gpu_random_OBJECTS = test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT) +validate_real_single_generalized_decomp_1stage_gpu_random_OBJECTS = $(am_validate_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_generalized_decomp_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_decomp_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_generalized_decomp_1stage_random_OBJECTS = test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.$(OBJEXT) +validate_real_single_generalized_decomp_1stage_random_OBJECTS = $(am_validate_real_single_generalized_decomp_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_decomp_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_generalized_decomp_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_generalized_decomp_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_single_generalized_decomp_1stage_random_all_layouts_OBJECTS = $(am_validate_real_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_hermitian_multiply_1stage_gpu_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_hermitian_multiply_1stage_gpu_random_OBJECTS = 
test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT) +validate_real_single_hermitian_multiply_1stage_gpu_random_OBJECTS = $(am_validate_real_single_hermitian_multiply_1stage_gpu_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_gpu_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT) +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS = $(am_validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_hermitian_multiply_1stage_random_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_hermitian_multiply_1stage_random_OBJECTS = test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.$(OBJEXT) 
+validate_real_single_hermitian_multiply_1stage_random_OBJECTS = $(am_validate_real_single_hermitian_multiply_1stage_random_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_hermitian_multiply_1stage_random_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_random_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_hermitian_multiply_1stage_random_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS = test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT) +validate_real_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS = $(am_validate_real_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_random_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@am_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS = test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.$(OBJEXT) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS = $(am_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@am_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_solve_tridiagonal_1stage_toeplitz_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_real_single_solve_tridiagonal_1stage_toeplitz_OBJECTS = test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.$(OBJEXT) +validate_real_single_solve_tridiagonal_1stage_toeplitz_OBJECTS = $(am_validate_real_single_solve_tridiagonal_1stage_toeplitz_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES = \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_toeplitz_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_solve_tridiagonal_1stage_toeplitz_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES_DIST = \ + test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@am_validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS = test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.$(OBJEXT) +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS = $(am_validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_LINK = \ + $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_single_complex_2stage_banded@SUFFIX@_SOURCES_DIST = \ + test/Fortran/elpa2/single_complex_2stage_banded.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@am_validate_single_complex_2stage_banded@SUFFIX@_OBJECTS = test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.$(OBJEXT) +validate_single_complex_2stage_banded@SUFFIX@_OBJECTS = \ + $(am_validate_single_complex_2stage_banded@SUFFIX@_OBJECTS) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_single_complex_2stage_banded@SUFFIX@_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ $(test_program_ldadd) +validate_single_complex_2stage_banded@SUFFIX@_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) 
$(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_single_complex_2stage_banded@SUFFIX@_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am__validate_single_real_2stage_banded@SUFFIX@_SOURCES_DIST = \ + test/Fortran/elpa2/single_real_2stage_banded.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@am_validate_single_real_2stage_banded@SUFFIX@_OBJECTS = test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.$(OBJEXT) +validate_single_real_2stage_banded@SUFFIX@_OBJECTS = \ + $(am_validate_single_real_2stage_banded@SUFFIX@_OBJECTS) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_single_real_2stage_banded@SUFFIX@_DEPENDENCIES = \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ $(test_program_ldadd) +validate_single_real_2stage_banded@SUFFIX@_LINK = $(LIBTOOL) \ + $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(FCLD) \ + $(validate_single_real_2stage_banded@SUFFIX@_FCFLAGS) \ + $(FCFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +am_validate_split_comm_real_double_OBJECTS = test/Fortran/validate_split_comm_real_double-test_split_comm.$(OBJEXT) +validate_split_comm_real_double_OBJECTS = \ + $(am_validate_split_comm_real_double_OBJECTS) +validate_split_comm_real_double_DEPENDENCIES = $(test_program_ldadd) +validate_split_comm_real_double_LINK = $(LIBTOOL) $(AM_V_lt) --tag=FC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(FCLD) \ + $(validate_split_comm_real_double_FCFLAGS) $(FCFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) am__v_P_0 = false @@ -556,7 +5915,101 @@ am__v_at_1 = DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/depcomp -am__depfiles_maybe = depfiles +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Plo \ + src/$(DEPDIR)/elpa_c_interface.Plo \ + src/$(DEPDIR)/elpa_index.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_double_precision.Plo \ + 
src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_double_precision.Plo \ + 
src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sparc64_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sparc64_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sparc64_6hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_single_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_double_precision.Plo \ + src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_single_precision.Plo \ + src/elpa_generalized/$(DEPDIR)/cannon.Plo \ + src/ftimings/$(DEPDIR)/highwater_mark.Plo \ + src/ftimings/$(DEPDIR)/papi.Plo \ + src/ftimings/$(DEPDIR)/resident_set_size.Plo \ + src/ftimings/$(DEPDIR)/time.Plo \ + src/ftimings/$(DEPDIR)/virtual_memory.Plo \ + src/helpers/$(DEPDIR)/get_cpuid_set.Plo \ + src/helpers/$(DEPDIR)/print_build_config.Plo \ + test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po \ + test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po \ + test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po \ + test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po \ + 
test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po \ + 
test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po \ + test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po \ + test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po \ + test/shared/$(DEPDIR)/test_redir.Plo am__mv = mv -f PPFCCOMPILE = $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_FCFLAGS) $(FCFLAGS) @@ -568,7 +6021,6 @@ am__v_PPFC_ = $(am__v_PPFC_@AM_DEFAULT_V@) am__v_PPFC_0 = @echo " PPFC " $@; am__v_PPFC_1 = -FCLD = $(FC) FCLINK = $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) \ $(AM_LDFLAGS) $(LDFLAGS) -o $@ @@ -602,56 +6054,710 @@ am__v_FC_0 = @echo " FC " $@; am__v_FC_1 = CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) -LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \ - $(CCASFLAGS) AM_V_CCAS = $(am__v_CCAS_@AM_V@) am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@) am__v_CCAS_0 = @echo " CCAS " $@; am__v_CCAS_1 = SOURCES = $(libelpa@SUFFIX@_la_SOURCES) \ - $(elpa1_test_complex@SUFFIX@_SOURCES) \ - $(elpa1_test_complex_c_version@SUFFIX@_SOURCES) \ - 
$(elpa1_test_real@SUFFIX@_SOURCES) \ - $(elpa1_test_real_c_version@SUFFIX@_SOURCES) \ - $(elpa1_test_real_with_c@SUFFIX@_SOURCES) \ + $(libelpa@SUFFIX@_private_la_SOURCES) \ + $(libelpa@SUFFIX@_public_la_SOURCES) \ + $(libelpatest@SUFFIX@_la_SOURCES) $(nodist_wrapper_la_SOURCES) \ + $(elpa2_print_kernels@SUFFIX@_SOURCES) \ + $(test_skewsymmetric_real_double_SOURCES) \ + $(test_skewsymmetric_real_single_SOURCES) \ + $(validate_autotune_c_version_complex_double_SOURCES) \ + $(validate_autotune_c_version_complex_single_SOURCES) \ + $(validate_autotune_c_version_real_double_SOURCES) \ + $(validate_autotune_c_version_real_single_SOURCES) \ + $(validate_autotune_complex_double_SOURCES) \ + $(validate_autotune_complex_single_SOURCES) \ + $(validate_autotune_real_double_SOURCES) \ + $(validate_autotune_real_single_SOURCES) \ + $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_double_eigenvectors_1stage_random_SOURCES) \ + $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_c_version_complex_double_generalized_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_double_generalized_1stage_random_SOURCES) \ + $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_double_generalized_decomp_1stage_random_SOURCES) \ + $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_single_eigenvectors_1stage_random_SOURCES) \ + $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_c_version_complex_single_generalized_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_single_generalized_1stage_random_SOURCES) \ + 
$(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_c_version_complex_single_generalized_decomp_1stage_random_SOURCES) \ + $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_double_eigenvectors_1stage_random_SOURCES) \ + $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_c_version_real_double_generalized_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_double_generalized_1stage_random_SOURCES) \ + $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_double_generalized_decomp_1stage_random_SOURCES) \ + $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_single_eigenvectors_1stage_random_SOURCES) \ + $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_c_version_real_single_generalized_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_single_generalized_1stage_random_SOURCES) \ + $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_c_version_real_single_generalized_decomp_1stage_random_SOURCES) \ + $(validate_complex_2stage_banded@SUFFIX@_SOURCES) \ + $(validate_complex_double_cholesky_1stage_gpu_random_SOURCES) \ + $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_cholesky_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_cholesky_1stage_random_SOURCES) \ + $(validate_complex_double_cholesky_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_double_cholesky_1stage_toeplitz_SOURCES) \ + 
$(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvalues_1stage_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_analytic_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_random_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES) \ + 
$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_scalapack_all_analytic_SOURCES) \ + $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES) \ + $(validate_complex_double_eigenvectors_scalapack_part_analytic_SOURCES) \ + $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES) \ + $(validate_complex_double_generalized_1stage_gpu_random_SOURCES) \ + $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_generalized_1stage_random_SOURCES) \ + $(validate_complex_double_generalized_1stage_random_all_layouts_SOURCES) \ + 
$(validate_complex_double_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_generalized_decomp_1stage_random_SOURCES) \ + $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_double_hermitian_multiply_1stage_gpu_random_SOURCES) \ + $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_double_hermitian_multiply_1stage_random_SOURCES) \ + $(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_single_cholesky_1stage_gpu_random_SOURCES) \ + $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_cholesky_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_cholesky_1stage_random_SOURCES) \ + $(validate_complex_single_cholesky_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_single_cholesky_1stage_toeplitz_SOURCES) \ + $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvalues_1stage_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_analytic_SOURCES) \ + 
$(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_random_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + 
$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_complex_single_generalized_1stage_gpu_random_SOURCES) \ + $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_generalized_1stage_random_SOURCES) \ + $(validate_complex_single_generalized_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_single_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_generalized_decomp_1stage_random_SOURCES) \ + $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_SOURCES) \ + $(validate_complex_single_hermitian_multiply_1stage_gpu_random_SOURCES) \ + $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_complex_single_hermitian_multiply_1stage_random_SOURCES) \ + $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_SOURCES) \ + $(validate_double_instance@SUFFIX@_SOURCES) \ + $(validate_multiple_objs_real_double_SOURCES) \ + $(validate_multiple_objs_real_double_c_version_SOURCES) \ + $(validate_real_2stage_banded@SUFFIX@_SOURCES) \ + $(validate_real_double_cholesky_1stage_gpu_random_SOURCES) \ + $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_SOURCES) \ + $(validate_real_double_cholesky_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + 
$(validate_real_double_cholesky_1stage_random_SOURCES) \ + $(validate_real_double_cholesky_1stage_random_all_layouts_SOURCES) \ + $(validate_real_double_cholesky_1stage_random_split_comm_myself_SOURCES) \ + $(validate_real_double_cholesky_1stage_toeplitz_SOURCES) \ + $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_frank_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_gpu_frank_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_toeplitz_SOURCES) \ + $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_frank_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_analytic_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_frank_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_frank_SOURCES) \ + 
$(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_random_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_frank_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES) \ + 
$(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_frank_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_scalapack_all_analytic_SOURCES) \ + $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES) \ + $(validate_real_double_eigenvectors_scalapack_part_analytic_SOURCES) \ + 
$(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES) \ + $(validate_real_double_generalized_1stage_gpu_random_SOURCES) \ + $(validate_real_double_generalized_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_generalized_1stage_random_SOURCES) \ + $(validate_real_double_generalized_1stage_random_all_layouts_SOURCES) \ + $(validate_real_double_generalized_decomp_1stage_gpu_random_SOURCES) \ + $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_generalized_decomp_1stage_random_SOURCES) \ + $(validate_real_double_generalized_decomp_1stage_random_all_layouts_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_frank_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_frank_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_random_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_random_SOURCES) \ + $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_SOURCES) \ + $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_double_solve_tridiagonal_1stage_toeplitz_SOURCES) \ + $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_cholesky_1stage_gpu_random_SOURCES) \ + $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_cholesky_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_cholesky_1stage_random_SOURCES) \ + 
$(validate_real_single_cholesky_1stage_random_all_layouts_SOURCES) \ + $(validate_real_single_cholesky_1stage_toeplitz_SOURCES) \ + $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvalues_1stage_toeplitz_SOURCES) \ + $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_analytic_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_gpu_random_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_random_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_toeplitz_SOURCES) \ + $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES) \ + 
$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_generalized_1stage_gpu_random_SOURCES) \ + $(validate_real_single_generalized_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_generalized_1stage_random_SOURCES) \ + $(validate_real_single_generalized_1stage_random_all_layouts_SOURCES) \ + $(validate_real_single_generalized_decomp_1stage_gpu_random_SOURCES) \ + 
$(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_generalized_decomp_1stage_random_SOURCES) \ + $(validate_real_single_generalized_decomp_1stage_random_all_layouts_SOURCES) \ + $(validate_real_single_hermitian_multiply_1stage_gpu_random_SOURCES) \ + $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES) \ + $(validate_real_single_hermitian_multiply_1stage_random_SOURCES) \ + $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_SOURCES) \ + $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES) \ + $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES) \ + $(validate_real_single_solve_tridiagonal_1stage_toeplitz_SOURCES) \ + $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES) \ + $(validate_single_complex_2stage_banded@SUFFIX@_SOURCES) \ + $(validate_single_real_2stage_banded@SUFFIX@_SOURCES) \ + $(validate_split_comm_real_double_SOURCES) +DIST_SOURCES = $(libelpa@SUFFIX@_la_SOURCES) \ + $(am__libelpa@SUFFIX@_private_la_SOURCES_DIST) \ + $(libelpa@SUFFIX@_public_la_SOURCES) \ + $(am__libelpatest@SUFFIX@_la_SOURCES_DIST) \ $(elpa2_print_kernels@SUFFIX@_SOURCES) \ - $(elpa2_test_complex@SUFFIX@_SOURCES) \ - $(elpa2_test_complex_c_version@SUFFIX@_SOURCES) \ - $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES) \ - $(elpa2_test_complex_default_kernel@SUFFIX@_SOURCES) \ - $(elpa2_test_real@SUFFIX@_SOURCES) \ - $(elpa2_test_real_c_version@SUFFIX@_SOURCES) \ - $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES) \ - $(elpa2_test_real_default_kernel@SUFFIX@_SOURCES) \ - $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES) -DIST_SOURCES = $(am__libelpa@SUFFIX@_la_SOURCES_DIST) \ - $(am__elpa1_test_complex@SUFFIX@_SOURCES_DIST) \ - $(am__elpa1_test_complex_c_version@SUFFIX@_SOURCES_DIST) \ - $(am__elpa1_test_real@SUFFIX@_SOURCES_DIST) \ - 
$(am__elpa1_test_real_c_version@SUFFIX@_SOURCES_DIST) \ - $(am__elpa1_test_real_with_c@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_print_kernels@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_complex@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_complex_c_version@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_complex_default_kernel@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_real@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_real_c_version@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_real_default_kernel@SUFFIX@_SOURCES_DIST) \ - $(am__elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES_DIST) + $(test_skewsymmetric_real_double_SOURCES) \ + $(am__test_skewsymmetric_real_single_SOURCES_DIST) \ + $(am__validate_autotune_c_version_complex_double_SOURCES_DIST) \ + $(am__validate_autotune_c_version_complex_single_SOURCES_DIST) \ + $(am__validate_autotune_c_version_real_double_SOURCES_DIST) \ + $(am__validate_autotune_c_version_real_single_SOURCES_DIST) \ + $(am__validate_autotune_complex_double_SOURCES_DIST) \ + $(am__validate_autotune_complex_single_SOURCES_DIST) \ + $(am__validate_autotune_real_double_SOURCES_DIST) \ + $(am__validate_autotune_real_single_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + 
$(am__validate_c_version_complex_double_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_complex_single_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_double_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_generalized_1stage_gpu_random_SOURCES_DIST) \ + 
$(am__validate_c_version_real_single_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_c_version_real_single_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(validate_complex_2stage_banded@SUFFIX@_SOURCES) \ + $(am__validate_complex_double_cholesky_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_cholesky_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_cholesky_1stage_random_SOURCES) \ + $(am__validate_complex_double_cholesky_1stage_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_cholesky_1stage_toeplitz_SOURCES) \ + $(am__validate_complex_double_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvalues_1stage_toeplitz_SOURCES) \ + $(am__validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(am__validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_1stage_analytic_SOURCES) \ + $(am__validate_complex_double_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + 
$(am__validate_complex_double_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_1stage_random_SOURCES) \ + $(am__validate_complex_double_eigenvectors_1stage_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_1stage_toeplitz_SOURCES) \ + $(am__validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + 
$(am__validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(am__validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_scalapack_all_analytic_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_scalapack_part_analytic_SOURCES_DIST) \ + $(am__validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_generalized_1stage_random_SOURCES) \ + $(am__validate_complex_double_generalized_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_generalized_decomp_1stage_random_SOURCES) \ + $(am__validate_complex_double_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_double_hermitian_multiply_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_complex_double_hermitian_multiply_1stage_random_SOURCES) \ + $(am__validate_complex_double_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_gpu_random_SOURCES_DIST) \ + 
$(am__validate_complex_single_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_random_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_analytic_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_random_all_layouts_SOURCES_DIST) \ + 
$(am__validate_complex_single_eigenvectors_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_analytic_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_analytic_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST) \ + 
$(am__validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES_DIST) \ + $(am__validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(am__validate_complex_single_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_hermitian_multiply_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_complex_single_hermitian_multiply_1stage_random_SOURCES_DIST) \ + $(am__validate_complex_single_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST) \ + $(validate_double_instance@SUFFIX@_SOURCES) \ + $(am__validate_multiple_objs_real_double_SOURCES_DIST) \ + $(am__validate_multiple_objs_real_double_c_version_SOURCES_DIST) \ + $(validate_real_2stage_banded@SUFFIX@_SOURCES) \ + $(am__validate_real_double_cholesky_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_SOURCES_DIST) \ + $(am__validate_real_double_cholesky_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_cholesky_1stage_random_SOURCES) \ + 
$(am__validate_real_double_cholesky_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_cholesky_1stage_random_split_comm_myself_SOURCES_DIST) \ + $(validate_real_double_cholesky_1stage_toeplitz_SOURCES) \ + $(am__validate_real_double_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvalues_1stage_frank_SOURCES) \ + $(am__validate_real_double_eigenvalues_1stage_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_1stage_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvalues_1stage_toeplitz_SOURCES) \ + $(am__validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_frank_SOURCES) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES) \ + $(am__validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_1stage_analytic_SOURCES) \ + $(am__validate_real_double_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_1stage_frank_SOURCES) \ + $(am__validate_real_double_eigenvectors_1stage_frank_all_layouts_SOURCES_DIST) \ + 
$(am__validate_real_double_eigenvectors_1stage_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_1stage_random_SOURCES) \ + $(am__validate_real_double_eigenvectors_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_1stage_random_split_comm_myself_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_1stage_toeplitz_SOURCES) \ + $(am__validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_frank_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_SOURCES) \ + 
$(am__validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_random_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_frank_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_eigenvectors_2stage_default_kernel_random_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_SOURCES_DIST) \ + 
$(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES) \ + $(am__validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_scalapack_all_analytic_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_scalapack_part_analytic_SOURCES_DIST) \ + $(am__validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_generalized_1stage_random_SOURCES) \ + $(am__validate_real_double_generalized_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_generalized_decomp_1stage_random_SOURCES) \ + $(am__validate_real_double_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_hermitian_multiply_1stage_frank_SOURCES) \ + $(am__validate_real_double_hermitian_multiply_1stage_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_hermitian_multiply_1stage_gpu_frank_SOURCES_DIST) \ + $(am__validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_hermitian_multiply_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(validate_real_double_hermitian_multiply_1stage_random_SOURCES) \ + $(am__validate_real_double_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES_DIST) \ + 
$(am__validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(validate_real_double_solve_tridiagonal_1stage_toeplitz_SOURCES) \ + $(am__validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_random_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_cholesky_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_analytic_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_gpu_toeplitz_SOURCES_DIST) \ + 
$(am__validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_analytic_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_qr_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_analytic_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES_DIST) \ + 
$(am__validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_qr_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_random_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_generalized_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_generalized_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_generalized_1stage_random_SOURCES_DIST) \ + $(am__validate_real_single_generalized_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_generalized_decomp_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_generalized_decomp_1stage_random_SOURCES_DIST) \ + $(am__validate_real_single_generalized_decomp_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_hermitian_multiply_1stage_gpu_random_SOURCES_DIST) \ + $(am__validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_hermitian_multiply_1stage_random_SOURCES_DIST) \ + $(am__validate_real_single_hermitian_multiply_1stage_random_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES_DIST) \ + $(am__validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_real_single_solve_tridiagonal_1stage_toeplitz_SOURCES_DIST) \ + 
$(am__validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES_DIST) \ + $(am__validate_single_complex_2stage_banded@SUFFIX@_SOURCES_DIST) \ + $(am__validate_single_real_2stage_banded@SUFFIX@_SOURCES_DIST) \ + $(validate_split_comm_real_double_SOURCES) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac +am__pyelpa_PYTHON_DIST = python/pyelpa/__init__.py \ + python/pyelpa/distributedmatrix.py +am__py_compile = PYTHON=$(PYTHON) $(SHELL) $(py_compile) +am__pep3147_tweak = \ + sed -e 's|\.py$$||' -e 's|[^/]*$$|&.*.pyc\n&.*.pyo|' +py_compile = $(top_srcdir)/py-compile man1dir = $(mandir)/man1 man3dir = $(mandir)/man3 NROFF = nroff MANS = $(dist_man_MANS) -DATA = $(dist_doc_DATA) $(dist_files_DATA) $(pkgconfig_DATA) -HEADERS = $(nobase_elpa_include_HEADERS) +DATA = $(dist_doc_DATA) $(pkgconfig_DATA) +HEADERS = $(nobase_elpa_include_HEADERS) \ + $(nobase_nodist_elpa_include_HEADERS) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \ $(LISP)config.h.in # Read a list of newline-separated strings from the standard input, @@ -830,15 +6936,6 @@ bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \ bases=`echo $$bases` RECHECK_LOGS = $(TEST_LOGS) -am__EXEEXT_2 = elpa1_test_real@SUFFIX@.sh \ - elpa1_test_real_with_c@SUFFIX@.sh elpa2_test_real@SUFFIX@.sh \ - elpa2_test_real_default_kernel@SUFFIX@.sh \ - elpa1_test_complex@SUFFIX@.sh elpa2_test_complex@SUFFIX@.sh \ - elpa2_test_complex_default_kernel@SUFFIX@.sh \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_print_kernels@SUFFIX@$(EXEEXT) $(am__append_23) TEST_SUITE_LOG = test-suite.log TEST_EXTENSIONS = @EXEEXT@ .test LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver @@ -862,8 +6959,12 @@ am__DIST_COMMON = $(dist_man_MANS) $(srcdir)/Doxyfile.in \ $(srcdir)/Makefile.in 
$(srcdir)/config.h.in \ $(srcdir)/doxygen.am $(srcdir)/elpa.pc.in \ - $(srcdir)/generated_headers.am ar-lib compile config.guess \ - config.sub depcomp install-sh ltmain.sh missing test-driver + $(srcdir)/generated_headers.am $(srcdir)/test_programs.am \ + $(top_srcdir)/elpa/elpa_build_config.h.in \ + $(top_srcdir)/elpa/elpa_constants.h.in \ + $(top_srcdir)/elpa/elpa_version.h.in ar-lib compile \ + config.guess config.sub depcomp install-sh ltmain.sh missing \ + py-compile test-driver DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -882,6 +6983,7 @@ | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$' distcleancheck_listfiles = find . -type f -print ACLOCAL = @ACLOCAL@ +ACTUAL_FC = @ACTUAL_FC@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ AR = @AR@ @@ -897,6 +6999,9 @@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ +CPUEXT_FLAGS = @CPUEXT_FLAGS@ +CURRENT_API_VERSION = @CURRENT_API_VERSION@ +CURRENT_AUTOTUNE_VERSION = @CURRENT_AUTOTUNE_VERSION@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ @@ -932,10 +7037,56 @@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ +ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED = @ELPA_2STAGE_COMPLEX_AVX2_BLOCK1_COMPILED@ +ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED = @ELPA_2STAGE_COMPLEX_AVX2_BLOCK2_COMPILED@ +ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED = @ELPA_2STAGE_COMPLEX_AVX512_BLOCK1_COMPILED@ +ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED = @ELPA_2STAGE_COMPLEX_AVX512_BLOCK2_COMPILED@ +ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED = @ELPA_2STAGE_COMPLEX_AVX_BLOCK1_COMPILED@ +ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED = @ELPA_2STAGE_COMPLEX_AVX_BLOCK2_COMPILED@ +ELPA_2STAGE_COMPLEX_BGP_COMPILED = @ELPA_2STAGE_COMPLEX_BGP_COMPILED@ +ELPA_2STAGE_COMPLEX_BGQ_COMPILED = @ELPA_2STAGE_COMPLEX_BGQ_COMPILED@ +ELPA_2STAGE_COMPLEX_DEFAULT = @ELPA_2STAGE_COMPLEX_DEFAULT@ +ELPA_2STAGE_COMPLEX_GENERIC_COMPILED = 
@ELPA_2STAGE_COMPLEX_GENERIC_COMPILED@ +ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED = @ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE_COMPILED@ +ELPA_2STAGE_COMPLEX_GPU_COMPILED = @ELPA_2STAGE_COMPLEX_GPU_COMPILED@ +ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED = @ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY_COMPILED@ +ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED = @ELPA_2STAGE_COMPLEX_SSE_BLOCK1_COMPILED@ +ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED = @ELPA_2STAGE_COMPLEX_SSE_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_AVX2_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_AVX2_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_AVX2_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_AVX_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_AVX_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_AVX_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_BGP_COMPILED = @ELPA_2STAGE_REAL_BGP_COMPILED@ +ELPA_2STAGE_REAL_BGQ_COMPILED = @ELPA_2STAGE_REAL_BGQ_COMPILED@ +ELPA_2STAGE_REAL_DEFAULT = @ELPA_2STAGE_REAL_DEFAULT@ +ELPA_2STAGE_REAL_GENERIC_COMPILED = @ELPA_2STAGE_REAL_GENERIC_COMPILED@ +ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED = @ELPA_2STAGE_REAL_GENERIC_SIMPLE_COMPILED@ +ELPA_2STAGE_REAL_GPU_COMPILED = @ELPA_2STAGE_REAL_GPU_COMPILED@ +ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@ 
+ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED = @ELPA_2STAGE_REAL_SSE_ASSEMBLY_COMPILED@ +ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_SSE_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_SSE_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_SSE_BLOCK6_COMPILED@ +ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED = @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@ +ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED = @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@ +ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED = @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@ ELPA_SO_VERSION = @ELPA_SO_VERSION@ EXEEXT = @EXEEXT@ FC = @FC@ FCFLAGS = @FCFLAGS@ +FCFLAGS_F90 = @FCFLAGS_F90@ FCLIBS = @FCLIBS@ FC_MODINC = @FC_MODINC@ FC_MODOUT = @FC_MODOUT@ @@ -955,12 +7106,15 @@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ -MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +MPI_BINARY = @MPI_BINARY@ NM = @NM@ NMEDIT = @NMEDIT@ +NUMPY_INCLUDE = @NUMPY_INCLUDE@ +NVCC = @NVCC@ +NVCCFLAGS = @NVCCFLAGS@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OPENMP_CFLAGS = @OPENMP_CFLAGS@ @@ -977,12 +7131,20 @@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PKG_CONFIG_FILE = @PKG_CONFIG_FILE@ +PYTHON = @PYTHON@ +PYTHON_CONFIG = @PYTHON_CONFIG@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_INCLUDE = @PYTHON_INCLUDE@ +PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ RANLIB = @RANLIB@ SCALAPACK_FCFLAGS = @SCALAPACK_FCFLAGS@ SCALAPACK_LDFLAGS = @SCALAPACK_LDFLAGS@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ 
+SIMD_FLAGS = @SIMD_FLAGS@ STRIP = @STRIP@ SUFFIX = @SUFFIX@ VERSION = @VERSION@ @@ -1009,6 +7171,7 @@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ +cython_found = @cython_found@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ @@ -1030,11 +7193,17 @@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ +nvcc_found = @nvcc_found@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ +pkgpythondir = @pkgpythondir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +pyexecdir = @pyexecdir@ +pytest_found = @pytest_found@ +pythondir = @pythondir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -1043,162 +7212,4546 @@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ -with_amd_bulldozer_kernel = @with_amd_bulldozer_kernel@ +xxd_CHECK = @xxd_CHECK@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 -AM_FCFLAGS = $(SCALAPACK_FCFLAGS) @FC_MODINC@modules @FC_MODOUT@modules +AM_FCFLAGS = $(SCALAPACK_FCFLAGS) $(FC_MODINC)modules AM_LDFLAGS = $(SCALAPACK_LDFLAGS) +FCLD = @ACTUAL_FC@ # libelpa lib_LTLIBRARIES = libelpa@SUFFIX@.la -libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ -libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 src/mod_mpi.F90 \ - src/mod_mpi_stubs.F90 \ - src/elpa2_kernels/mod_fortran_interfaces.F90 \ - src/elpa_utilities.F90 src/elpa1_compute.F90 src/elpa1.F90 \ - src/elpa2_utilities.F90 src/mod_pack_unpack_real.F90 \ - src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_real.F90 \ - src/mod_compute_hh_trafo_complex.F90 \ - src/mod_pack_unpack_complex.F90 src/aligned_mem.F90 \ - src/elpa2_compute.F90 src/elpa2.F90 src/elpa_c_interface.F90 \ - src/elpa_qr/qr_utils.F90 src/elpa_qr/elpa_qrkernels.f90 \ - src/elpa_qr/elpa_pdlarfb.F90 src/elpa_qr/elpa_pdgeqrf.F90 \ +libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) 
-version-info $(ELPA_SO_VERSION) +libelpa@SUFFIX@_la_LIBADD = libelpa@SUFFIX@_public.la libelpa@SUFFIX@_private.la +libelpa@SUFFIX@_la_SOURCES = + +# parts with public interface + +# internal parts + +# library with shared sources for the test files +noinst_LTLIBRARIES = libelpa@SUFFIX@_public.la \ + libelpa@SUFFIX@_private.la libelpatest@SUFFIX@.la +libelpa@SUFFIX@_public_la_FCFLAGS = $(AM_FCFLAGS) $(FC_MODOUT)modules $(FC_MODINC)private_modules +libelpa@SUFFIX@_public_la_SOURCES = \ + src/elpa.F90 \ + src/elpa_api.F90 \ + src/elpa_constants.F90 + +libelpa@SUFFIX@_private_la_FCFLAGS = $(AM_FCFLAGS) $(FC_MODOUT)private_modules $(FC_MODINC)private_modules +libelpa@SUFFIX@_private_la_SOURCES = src/elpa_impl.F90 \ + src/elpa_autotune_impl.F90 src/elpa_abstract_impl.F90 \ + src/helpers/mod_precision.F90 \ + src/helpers/mod_blas_interfaces.F90 \ + src/helpers/mod_scalapack_interfaces.F90 \ + src/helpers/mod_mpi.F90 src/helpers/mod_mpi_stubs.F90 \ + src/helpers/mod_omp.F90 \ + src/elpa_generated_fortran_interfaces.F90 \ + src/elpa2/mod_redist_band.F90 \ + src/elpa2/mod_pack_unpack_cpu.F90 \ + src/elpa2/mod_compute_hh_trafo.F90 src/helpers/aligned_mem.F90 \ + src/elpa1/elpa1_compute_private.F90 \ + src/elpa1/elpa1_auxiliary.F90 \ + src/elpa2/elpa2_determine_workload.F90 \ + src/elpa2/elpa2_compute.F90 \ + src/elpa2/kernels/mod_single_hh_trafo_real.F90 \ + src/GPU/check_for_gpu.F90 src/GPU/mod_cuda.F90 \ + src/elpa2/GPU/interface_c_kernel.F90 \ + src/elpa2/mod_pack_unpack_gpu.F90 src/elpa2/qr/qr_utils.F90 \ + src/elpa2/qr/elpa_qrkernels.F90 src/elpa2/qr/elpa_pdlarfb.F90 \ + src/elpa2/qr/elpa_pdgeqrf.F90 src/elpa1/elpa1.F90 \ + src/elpa2/elpa2.F90 src/elpa_generalized/cannon.c \ + src/helpers/matrix_plot.F90 \ + src/general/mod_elpa_skewsymmetric_blas.F90 src/elpa_index.c \ + src/elpa_c_interface.c src/general/elpa_utilities.F90 \ $(am__append_1) $(am__append_2) $(am__append_3) \ - $(am__append_4) $(am__append_5) $(am__append_6) \ - $(am__append_7) $(am__append_8) 
$(am__append_9) \ - $(am__append_10) $(am__append_11) $(am__append_12) \ - $(am__append_13) $(am__append_14) $(am__append_15) \ - $(am__append_16) $(am__append_17) $(am__append_18) \ - $(am__append_19) $(am__append_20) $(am__append_21) -EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES = \ - src/elpa_reduce_add_vectors.X90 \ - src/elpa_transpose_vectors.X90 \ - src/redist_band.X90 + $(am__append_5) $(am__append_6) $(am__append_7) \ + $(am__append_8) $(am__append_9) $(am__append_10) \ + $(am__append_11) $(am__append_12) $(am__append_13) \ + $(am__append_14) $(am__append_15) $(am__append_16) \ + $(am__append_17) $(am__append_18) $(am__append_19) \ + $(am__append_20) $(am__append_21) $(am__append_22) \ + $(am__append_23) $(am__append_24) $(am__append_25) \ + $(am__append_26) $(am__append_27) $(am__append_28) \ + $(am__append_29) $(am__append_30) $(am__append_31) \ + $(am__append_32) $(am__append_33) $(am__append_34) \ + $(am__append_35) $(am__append_36) $(am__append_37) \ + $(am__append_38) $(am__append_39) $(am__append_40) \ + $(am__append_41) $(am__append_42) $(am__append_43) \ + $(am__append_44) $(am__append_45) $(am__append_46) \ + $(am__append_47) $(am__append_48) $(am__append_49) \ + $(am__append_50) $(am__append_51) $(am__append_52) \ + $(am__append_53) $(am__append_54) $(am__append_55) \ + $(am__append_56) $(am__append_57) $(am__append_58) \ + $(am__append_59) $(am__append_60) $(am__append_61) \ + $(am__append_62) $(am__append_63) $(am__append_64) \ + $(am__append_65) $(am__append_66) $(am__append_67) \ + $(am__append_68) $(am__append_69) $(am__append_70) \ + $(am__append_71) $(am__append_72) $(am__append_73) \ + $(am__append_74) $(am__append_75) +EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ + src/elpa1/elpa_reduce_add_vectors.F90 \ + src/elpa1/elpa_transpose_vectors.F90 \ + src/elpa_api_math_template.F90 src/elpa_impl_math_template.F90 \ + src/elpa_impl_generalized_transform_template.F90 \ + src/elpa1/elpa1_compute_template.F90 \ + 
src/elpa2/elpa2_compute_real_template.F90 \ + src/elpa2/elpa2_compute_complex_template.F90 \ + src/elpa1/elpa1_template.F90 src/elpa2/elpa2_template.F90 \ + src/elpa2/qr/qr_utils_template.F90 \ + src/elpa2/qr/elpa_pdlarfb_template.F90 \ + src/elpa2/qr/elpa_pdgeqrf_template.F90 \ + src/elpa2/elpa2_bandred_template.F90 \ + src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \ + src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ + src/elpa2/elpa2_tridiag_band_template.F90 \ + src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ + src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 \ + src/elpa2/kernels/real_template.F90 \ + src/elpa2/kernels/complex_template.F90 \ + src/elpa2/kernels/simple_template.F90 \ + src/elpa2/kernels/simple_block4_template.F90 \ + src/elpa2/kernels/simple_block6_template.F90 \ + src/elpa2/pack_unpack_cpu.F90 src/elpa2/pack_unpack_gpu.F90 \ + src/elpa2/compute_hh_trafo.F90 src/elpa2/redist_band.F90 \ + src/general/sanity.F90 src/elpa1/elpa_cholesky_template.F90 \ + src/elpa1/elpa_invert_trm.F90 src/elpa1/elpa_multiply_a_b.F90 \ + src/elpa1/elpa_solve_tridi_impl_public.F90 \ + src/general/elpa_ssr2_template.F90 \ + src/general/elpa_ssmv_template.F90 \ + src/general/precision_macros.h \ + src/general/precision_typedefs.h \ + src/general/precision_kinds.F90 $(am__append_4) + +# Assembly files +LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \ + $(CCASFLAGS) + -generated_headers = config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h +#if OPTIONAL_C_ERROR_ARGUMENT +# +#elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ +# $(top_srcdir)/src/elpa_impl_math_template.F90 \ +# $(top_srcdir)/src/elpa_api.F90 | elpa +# $(call extract_interface,!c_o>) +# +#else +#elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ +# $(top_srcdir)/src/elpa_impl_math_template.F90 \ +# $(top_srcdir)/src/elpa_api.F90 | 
elpa +# $(call extract_interface,!c_no>) +#endif +generated_headers = config-f90.h elpa/elpa_generated.h \ + test/shared/generated.h \ + src/elpa_generated_fortran_interfaces.h \ + src/elpa_generated_public_fortran_interfaces.h \ + src/fortran_constants.F90 BUILT_SOURCES = $(generated_headers) -# install any .mod files in the include/ dir +# install public headers and Fortran modules files in the include/ dir elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ -nobase_elpa_include_HEADERS = $(wildcard modules/*) elpa/elpa.h \ - elpa/elpa_kernel_constants.h elpa/elpa_generated.h +nobase_elpa_include_HEADERS = \ + $(wildcard modules/*) \ + src/helpers/lapack_interfaces.h \ + src/helpers/scalapack_interfaces.h \ + elpa/elpa_simd_constants.h \ + elpa/elpa.h \ + elpa/elpa_generic.h + +nobase_nodist_elpa_include_HEADERS = \ + elpa/elpa_version.h \ + elpa/elpa_constants.h \ + elpa/elpa_generated.h \ + elpa/elpa_generated_c_api.h + dist_man_MANS = \ - man/solve_evp_real.3 \ - man/solve_evp_real_1stage.3 \ - man/solve_evp_complex.3 \ - man/solve_evp_complex_1stage.3 \ - man/solve_evp_real_2stage.3 \ - man/solve_evp_complex_2stage.3 \ - man/get_elpa_row_col_comms.3 \ - man/get_elpa_communicators.3 \ - man/elpa2_print_kernels.1 - - -# other files to distribute -filesdir = $(docdir)/examples -dist_files_DATA = \ - test/fortran_test_programs/read_real.F90 \ - test/fortran_test_programs/test_complex2.F90 \ - test/fortran_test_programs/test_complex2_default_kernel.F90 \ - test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_complex.F90 \ - test/fortran_test_programs/test_real2.F90 \ - test/fortran_test_programs/test_real2_default_kernel.F90 \ - test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - test/fortran_test_programs/test_real.F90 \ - test/fortran_test_programs/test_real_with_c.F90 \ - src/elpa2_print_kernels.F90 + 
man/elpa2_print_kernels.1 \ + man/elpa_init.3 \ + man/elpa_allocate.3 \ + man/elpa_set.3 \ + man/elpa_setup.3 \ + man/elpa_eigenvalues.3 \ + man/elpa_eigenvectors.3 \ + man/elpa_skew_eigenvalues.3 \ + man/elpa_skew_eigenvectors.3 \ + man/elpa_generalized_eigenvectors.3 \ + man/elpa_generalized_eigenvalues.3 \ + man/elpa_cholesky.3 \ + man/elpa_invert_triangular.3 \ + man/elpa_solve_tridiagonal.3 \ + man/elpa_hermitian_multiply.3 \ + man/elpa_deallocate.3 \ + man/elpa_load_settings.3 \ + man/elpa_store_settings.3 \ + man/elpa_print_settings.3 \ + man/elpa_autotune_save_state.3 \ + man/elpa_autotune_load_state.3 \ + man/elpa_autotune_print_state.3 \ + man/elpa_autotune_setup.3 \ + man/elpa_autotune_step.3 \ + man/elpa_autotune_set_best.3 \ + man/elpa_autotune_deallocate.3 \ + man/elpa_uninit.3 -dist_doc_DATA = README.md USERS_GUIDE.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt +dist_doc_DATA = README.md USERS_GUIDE.md USERS_GUIDE_DEPRECATED_LEGACY_API.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt # pkg-config stuff pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = @PKG_CONFIG_FILE@ -build_lib = libelpa@SUFFIX@.la -@HAVE_REDIRECT_FALSE@redirect_sources = -@HAVE_REDIRECT_TRUE@redirect_sources = test/shared_sources/redir.c test/shared_sources/redirect.F90 - -#test/shared_sources/mod_precision_created.f90: src/mod_precision.f90 -# cp $(top_srcdir)/src/mod_precision.f90 $(top_srcdir)/test/shared_sources/mod_precision_created.f90 -shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 \ - test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \ - test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 \ - test/shared_sources/mod_output_types.F90 - -@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_real_c_version.c $(shared_sources) 
$(redirect_sources) -@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -@WITH_OPENMP_FALSE@EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -@WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_complex_c_version.c $(shared_sources) $(redirect_sources) -@WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -@WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -@WITH_OPENMP_FALSE@EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -@WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_real_c_version.c $(shared_sources) $(redirect_sources) -@WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_LDADD = $(build_lib) -@WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -@WITH_OPENMP_FALSE@EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -@WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_complex_c_version.c $(shared_sources) $(redirect_sources) -@WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) -@WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) -@WITH_OPENMP_FALSE@EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa1_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real.F90 $(shared_sources) $(redirect_sources) -elpa1_test_real@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa1_test_real_with_c@SUFFIX@_SOURCES = 
test/fortran_test_programs/test_real_with_c.F90 test/shared_sources/mod_from_c.F90 \ - test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) - -elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 - -#elpa1_test_complex_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) -#elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib) -#EXTRA_elpa1_test_complex_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources) -elpa2_test_real@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) -elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ - $(shared_sources) $(redirect_sources) - -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ - $(shared_sources) $(redirect_sources) - -elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) 
-EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources) -elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources) -elpa2_test_complex@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources) -elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ - $(shared_sources) $(redirect_sources) - -elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) -EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 -elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90 $(shared_sources) $(redirect_sources) -elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib) -check_SCRIPTS = elpa1_test_real@SUFFIX@.sh \ - elpa1_test_real_with_c@SUFFIX@.sh elpa2_test_real@SUFFIX@.sh \ - elpa2_test_real_default_kernel@SUFFIX@.sh \ - elpa1_test_complex@SUFFIX@.sh elpa2_test_complex@SUFFIX@.sh \ - elpa2_test_complex_default_kernel@SUFFIX@.sh \ - elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ - elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ - 
elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ - elpa2_print_kernels@SUFFIX@ $(am__append_23) -@WITH_MPI_FALSE@wrapper = "" - -# test scripts -@WITH_MPI_TRUE@wrapper = "mpiexec -n 2 " +check_SCRIPTS = $(am__append_80) $(am__append_83) $(am__append_86) \ + $(am__append_89) $(am__append_92) $(am__append_95) \ + $(am__append_98) $(am__append_101) $(am__append_104) \ + $(am__append_107) $(am__append_110) $(am__append_113) \ + $(am__append_116) $(am__append_119) $(am__append_122) \ + $(am__append_125) $(am__append_128) $(am__append_131) \ + $(am__append_134) \ + validate_complex_double_eigenvectors_1stage_analytic_default.sh \ + $(am__append_139) \ + validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh \ + validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh \ + $(am__append_144) $(am__append_147) $(am__append_150) \ + $(am__append_153) $(am__append_156) \ + validate_real_double_eigenvectors_1stage_analytic_default.sh \ + $(am__append_161) \ + validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh \ + validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh \ + $(am__append_166) $(am__append_169) $(am__append_172) \ + $(am__append_175) $(am__append_178) $(am__append_181) \ + $(am__append_184) $(am__append_187) $(am__append_190) \ + $(am__append_193) $(am__append_196) $(am__append_199) \ + $(am__append_202) \ + validate_real_double_eigenvalues_1stage_frank_default.sh \ + $(am__append_207) \ + validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh \ + $(am__append_212) \ + validate_real_double_eigenvectors_1stage_frank_default.sh \ + $(am__append_217) \ + validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh \ + validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh \ + $(am__append_222) \ + validate_real_double_hermitian_multiply_1stage_frank_default.sh \ + $(am__append_227) $(am__append_230) $(am__append_233) \ + 
$(am__append_236) $(am__append_239) $(am__append_242) \ + $(am__append_245) $(am__append_248) $(am__append_251) \ + $(am__append_254) $(am__append_257) \ + validate_complex_double_cholesky_1stage_random_default.sh \ + $(am__append_262) \ + validate_real_double_cholesky_1stage_random_default.sh \ + $(am__append_267) $(am__append_270) $(am__append_273) \ + $(am__append_276) $(am__append_279) $(am__append_282) \ + validate_complex_double_eigenvectors_1stage_random_default.sh \ + $(am__append_287) \ + validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh \ + validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh \ + $(am__append_292) \ + validate_real_double_eigenvectors_1stage_random_default.sh \ + $(am__append_297) \ + validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh \ + validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh \ + $(am__append_302) $(am__append_305) $(am__append_308) \ + $(am__append_311) $(am__append_314) $(am__append_317) \ + $(am__append_320) $(am__append_323) $(am__append_326) \ + $(am__append_329) \ + validate_complex_double_generalized_1stage_random_default.sh \ + $(am__append_334) \ + validate_real_double_generalized_1stage_random_default.sh \ + $(am__append_339) $(am__append_342) $(am__append_345) \ + $(am__append_348) $(am__append_351) \ + validate_complex_double_generalized_decomp_1stage_random_default.sh \ + $(am__append_356) \ + validate_real_double_generalized_decomp_1stage_random_default.sh \ + $(am__append_361) $(am__append_364) $(am__append_367) \ + $(am__append_370) $(am__append_373) \ + validate_complex_double_hermitian_multiply_1stage_random_default.sh \ + $(am__append_378) \ + validate_real_double_hermitian_multiply_1stage_random_default.sh \ + $(am__append_383) $(am__append_386) $(am__append_389) \ + $(am__append_392) $(am__append_395) \ + validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh \ + 
validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh \ + $(am__append_400) $(am__append_403) $(am__append_406) \ + $(am__append_409) $(am__append_412) $(am__append_415) \ + $(am__append_418) $(am__append_421) $(am__append_424) \ + $(am__append_427) $(am__append_430) $(am__append_433) \ + $(am__append_436) $(am__append_439) $(am__append_442) \ + $(am__append_445) $(am__append_448) $(am__append_451) \ + $(am__append_454) $(am__append_457) $(am__append_460) \ + $(am__append_463) $(am__append_466) $(am__append_469) \ + $(am__append_472) $(am__append_475) $(am__append_478) \ + $(am__append_481) $(am__append_484) $(am__append_487) \ + $(am__append_490) $(am__append_493) $(am__append_496) \ + $(am__append_499) $(am__append_502) $(am__append_505) \ + $(am__append_508) $(am__append_511) $(am__append_514) \ + $(am__append_517) $(am__append_520) $(am__append_523) \ + $(am__append_526) $(am__append_529) $(am__append_532) \ + $(am__append_535) $(am__append_538) $(am__append_541) \ + $(am__append_544) $(am__append_547) $(am__append_550) \ + $(am__append_553) $(am__append_556) \ + validate_complex_double_cholesky_1stage_toeplitz_default.sh \ + $(am__append_561) \ + validate_real_double_cholesky_1stage_toeplitz_default.sh \ + $(am__append_566) $(am__append_569) $(am__append_572) \ + $(am__append_575) $(am__append_578) \ + validate_complex_double_eigenvalues_1stage_toeplitz_default.sh \ + $(am__append_583) \ + validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh \ + $(am__append_588) \ + validate_real_double_eigenvalues_1stage_toeplitz_default.sh \ + $(am__append_593) \ + validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh \ + $(am__append_598) $(am__append_601) $(am__append_604) \ + $(am__append_607) $(am__append_610) $(am__append_613) \ + $(am__append_616) $(am__append_619) $(am__append_622) \ + validate_complex_double_eigenvectors_1stage_toeplitz_default.sh \ + $(am__append_627) \ + 
validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh \ + validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh \ + $(am__append_632) \ + validate_real_double_eigenvectors_1stage_toeplitz_default.sh \ + $(am__append_637) \ + validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh \ + validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh \ + $(am__append_642) $(am__append_645) $(am__append_648) \ + $(am__append_651) $(am__append_654) $(am__append_657) \ + $(am__append_660) $(am__append_663) $(am__append_666) \ + validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh \ + $(am__append_671) $(am__append_674) $(am__append_677) \ + $(am__append_680) $(am__append_683) $(am__append_686) \ + $(am__append_689) $(am__append_692) $(am__append_695) \ + $(am__append_698) $(am__append_701) $(am__append_704) \ + $(am__append_707) $(am__append_710) $(am__append_713) \ + $(am__append_716) $(am__append_719) $(am__append_722) \ + $(am__append_725) $(am__append_728) $(am__append_731) \ + $(am__append_734) $(am__append_737) $(am__append_740) \ + $(am__append_743) $(am__append_746) $(am__append_749) \ + $(am__append_752) $(am__append_755) $(am__append_758) \ + $(am__append_761) $(am__append_764) $(am__append_767) \ + $(am__append_770) $(am__append_773) $(am__append_776) \ + $(am__append_779) $(am__append_782) $(am__append_785) \ + $(am__append_788) $(am__append_791) $(am__append_794) \ + $(am__append_797) $(am__append_800) $(am__append_803) \ + $(am__append_806) $(am__append_807) $(am__append_809) \ + $(am__append_811) $(am__append_813) $(am__append_815) \ + $(am__append_817) $(am__append_819) \ + test_skewsymmetric_real_double_extended.sh $(am__append_821) \ + $(am__append_823) validate_split_comm_real_double_extended.sh \ + validate_double_instance@SUFFIX@_default.sh \ + validate_real_2stage_banded@SUFFIX@_default.sh \ + validate_complex_2stage_banded@SUFFIX@_default.sh \ + 
$(am__append_826) $(am__append_828) $(am__append_829) +test_program_ldadd = libelpatest@SUFFIX@.la libelpa@SUFFIX@.la +test_program_fcflags = $(AM_FCFLAGS) $(FC_MODOUT)test_modules $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags) +libelpatest@SUFFIX@_la_SOURCES = \ + test/shared/tests_variable_definitions.F90 \ + test/shared/mod_tests_scalapack_interfaces.F90 \ + test/shared/mod_tests_blas_interfaces.F90 \ + test/shared/test_util.F90 \ + test/shared/test_read_input_parameters.F90 \ + test/shared/test_check_correctness.F90 \ + test/shared/test_setup_mpi.F90 \ + test/shared/test_blacs_infrastructure.F90 \ + test/shared/test_prepare_matrix.F90 \ + test/shared/test_analytic.F90 test/shared/test_output_type.F90 \ + $(am__append_76) $(am__append_77) +elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2/elpa2_print_kernels.F90 +elpa2_print_kernels@SUFFIX@_LDADD = libelpa@SUFFIX@.la +elpa2_print_kernels@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)modules +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c 
+@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_2stage_default_kernel_random\" \ 
+@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS = 
$(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_2stage_default_kernel_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_generalized_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ 
-DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_generalized_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_generalized_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_generalized_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_generalized_decomp_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM + 
+@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_decomp_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_generalized_decomp_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_generalized_decomp_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_decomp_1stage_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_generalized_decomp_1stage_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ 
+@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) 
+@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_generalized_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_generalized_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_1stage_gpu_random_SOURCES = test/C/test.c 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_generalized_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_generalized_1stage_gpu_random\" \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_double_generalized_decomp_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c 
+@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_double_generalized_decomp_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_complex_single_generalized_decomp_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ 
+@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_c_version_real_single_generalized_decomp_1stage_gpu_random\" \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS + +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + 
-DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts\" \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_ALL \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_SOURCES = test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_LDADD = $(test_program_ldadd) +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_all_analytic_FCFLAGS = $(test_program_fcflags) \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_all_analytic\" \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_COMPLEX \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_ALL \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC + 
+@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts\" \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_PART \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_SOURCES = test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_LDADD = $(test_program_ldadd) +@WITH_SCALAPACK_TESTS_TRUE@validate_complex_double_eigenvectors_scalapack_part_analytic_FCFLAGS = $(test_program_fcflags) \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_part_analytic\" \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_COMPLEX \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_PART \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC + 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ 
+@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS + +validate_real_double_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + 
-DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts\" \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_ALL \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_SOURCES = test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_LDADD = $(test_program_ldadd) +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_all_analytic_FCFLAGS = $(test_program_fcflags) \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_all_analytic\" \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_REAL \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_ALL \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC + 
+@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts\" \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_PART \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WITH_MPI_TRUE@@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_SOURCES = test/Fortran/test.F90 +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_LDADD = $(test_program_ldadd) +@WITH_SCALAPACK_TESTS_TRUE@validate_real_double_eigenvectors_scalapack_part_analytic_FCFLAGS = $(test_program_fcflags) \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_part_analytic\" \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_REAL \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_DOUBLE \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_SCALAPACK_PART \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_GPU=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_SCALAPACK_TESTS_TRUE@ -DTEST_MATRIX_ANALYTIC + 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_analytic\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 
\ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_ANALYTIC + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_analytic\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_analytic\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ 
-DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_analytic\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_ANALYTIC + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_analytic\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_analytic\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_ANALYTIC \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvalues_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvalues_2stage_default_kernel_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_2stage_all_kernels_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_KERNELS + +validate_real_double_eigenvectors_2stage_default_kernel_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + 
+@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_frank_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_hermitian_multiply_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS 
+ +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_FRANK \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_LDADD = $(test_program_ldadd) 
+@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_frank\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_FRANK + +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_random_split_comm_myself\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + +validate_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random_split_comm_myself\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + 
+validate_real_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS 
= $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + 
+@WITH_MPI_TRUE@validate_real_double_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM 
\ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_1stage_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ 
-DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_1stage_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ 
-DTEST_ALL_LAYOUTS + +validate_complex_double_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_qr_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_qr_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_qr_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_qr_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ 
+@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random_split_comm_myself\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = 
$(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@ 
-DTEST_ALL_KERNELS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DSPLIT_COMM_MYSELF + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) 
\ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ 
+@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = 
$(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_SOURCES 
= test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) 
+@WITH_GPU_VERSION_TRUE@validate_real_double_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_random\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts\" \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_RANDOM \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_gpu_random\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_HERMITIAN_MULTIPLY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_RANDOM + +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) 
+validate_real_double_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_toeplitz_FCFLAGS = 
$(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL 
\ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + 
-DTEST_ALL_KERNELS + +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + 
-DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS + +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ 
-DTEST_CASE=\"validate_real_single_eigenvectors_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + 
+@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts\" \ +@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_MPI_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +validate_real_double_solve_tridiagonal_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_solve_tridiagonal_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_real_single_solve_tridiagonal_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_GPU=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CHOLESKY \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ 
-DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ 
-DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVALUES \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 
\ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ 
-DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ 
-DTEST_ALL_KERNELS + +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ 
-DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_COMPLEX \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_KERNELS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ 
-DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_ALL_KERNELS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_EIGENVECTORS \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_2STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + 
+@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts\" \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WITH_GPU_VERSION_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WITH_GPU_VERSION_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WITH_GPU_VERSION_TRUE@validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz\" \ +@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_DOUBLE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SINGLE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_MATRIX_TOEPLITZ \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@@WITH_MPI_TRUE@ -DTEST_ALL_LAYOUTS + +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz\" \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SINGLE \ 
+@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVE_TRIDIAGONAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_SOLVER_1STAGE \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_GPU=1 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_QR_DECOMPOSITION=0 \ +@WANT_SINGLE_PRECISION_REAL_TRUE@@WITH_GPU_VERSION_TRUE@ -DTEST_MATRIX_TOEPLITZ + +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_complex_double_SOURCES = test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_complex_double_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_complex_double_CFLAGS = $(test_program_cflags) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_COMPLEX \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE + +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_real_double_SOURCES = test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_real_double_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_autotune_c_version_real_double_CFLAGS = $(test_program_cflags) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE + +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_c_version_complex_single_SOURCES = test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_c_version_complex_single_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_c_version_complex_single_CFLAGS = $(test_program_cflags) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ 
+@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE + +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_c_version_real_single_SOURCES = test/C/test_autotune.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_c_version_real_single_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_c_version_real_single_CFLAGS = $(test_program_cflags) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE + +@ENABLE_AUTOTUNING_TRUE@validate_autotune_complex_double_SOURCES = test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@validate_autotune_complex_double_LDADD = $(test_program_ldadd) +@ENABLE_AUTOTUNING_TRUE@validate_autotune_complex_double_FCFLAGS = $(test_program_fcflags) \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_COMPLEX \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_DOUBLE + +@ENABLE_AUTOTUNING_TRUE@validate_autotune_real_double_SOURCES = test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@validate_autotune_real_double_LDADD = $(test_program_ldadd) +@ENABLE_AUTOTUNING_TRUE@validate_autotune_real_double_FCFLAGS = $(test_program_fcflags) \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_DOUBLE + +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_complex_single_SOURCES = test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_complex_single_LDADD = $(test_program_ldadd) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_autotune_complex_single_FCFLAGS = $(test_program_fcflags) \ +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_COMPLEX \ 
+@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_COMPLEX_TRUE@ -DTEST_SINGLE + +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_real_single_SOURCES = test/Fortran/test_autotune.F90 +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_real_single_LDADD = $(test_program_ldadd) +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@validate_autotune_real_single_FCFLAGS = $(test_program_fcflags) \ +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE + +@ENABLE_AUTOTUNING_TRUE@validate_multiple_objs_real_double_SOURCES = test/Fortran/test_multiple_objs.F90 +@ENABLE_AUTOTUNING_TRUE@validate_multiple_objs_real_double_LDADD = $(test_program_ldadd) +@ENABLE_AUTOTUNING_TRUE@validate_multiple_objs_real_double_FCFLAGS = $(test_program_fcflags) \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@ -DTEST_DOUBLE + +test_skewsymmetric_real_double_SOURCES = test/Fortran/test_skewsymmetric.F90 +test_skewsymmetric_real_double_LDADD = $(test_program_ldadd) +test_skewsymmetric_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE + +@WANT_SINGLE_PRECISION_REAL_TRUE@test_skewsymmetric_real_single_SOURCES = test/Fortran/test_skewsymmetric.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@test_skewsymmetric_real_single_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@test_skewsymmetric_real_single_FCFLAGS = $(test_program_fcflags) \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_REAL \ +@WANT_SINGLE_PRECISION_REAL_TRUE@ -DTEST_SINGLE + +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_multiple_objs_real_double_c_version_SOURCES = test/C/test_multiple_objs.c +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_multiple_objs_real_double_c_version_LDADD = $(test_program_ldadd) $(FCLIBS) +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@validate_multiple_objs_real_double_c_version_CFLAGS = 
$(test_program_cflags) \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_REAL \ +@ENABLE_AUTOTUNING_TRUE@@ENABLE_C_TESTS_TRUE@ -DTEST_DOUBLE + +validate_split_comm_real_double_SOURCES = test/Fortran/test_split_comm.F90 +validate_split_comm_real_double_LDADD = $(test_program_ldadd) +validate_split_comm_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE + +validate_double_instance@SUFFIX@_SOURCES = test/Fortran/elpa2/double_instance.F90 +validate_double_instance@SUFFIX@_LDADD = $(test_program_ldadd) +validate_double_instance@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +validate_real_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/real_2stage_banded.F90 +validate_real_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_real_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +validate_complex_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/complex_2stage_banded.F90 +validate_complex_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +validate_complex_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_single_real_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/single_real_2stage_banded.F90 +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_single_real_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) +@WANT_SINGLE_PRECISION_REAL_TRUE@validate_single_real_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_single_complex_2stage_banded@SUFFIX@_SOURCES = test/Fortran/elpa2/single_complex_2stage_banded.F90 +@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_single_complex_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd) 
+@WANT_SINGLE_PRECISION_COMPLEX_TRUE@validate_single_complex_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules + +# python wrapper +pyelpadir = $(pythondir)/pyelpa +@WITH_PYTHON_FALSE@pyelpa_PYTHON = +@WITH_PYTHON_TRUE@pyelpa_PYTHON = python/pyelpa/__init__.py python/pyelpa/distributedmatrix.py +@WITH_PYTHON_FALSE@pyelpa_LTLIBRARIES = +@WITH_PYTHON_TRUE@pyelpa_LTLIBRARIES = wrapper.la +nodist_wrapper_la_SOURCES = python/pyelpa/wrapper.c +wrapper_la_LDFLAGS = -module -avoid-version -shared $(AM_LDFLAGS) +wrapper_la_LIBADD = libelpa@SUFFIX@.la +wrapper_la_CFLAGS = $(PYTHON_INCLUDE) $(NUMPY_INCLUDE) $(AM_CFLAGS) +@WITH_MPI_FALSE@wrapper = +@WITH_MPI_TRUE@wrapper = $(MPI_BINARY) -n $${TASKS:-$(TASKS)} +TESTS = $(check_SCRIPTS) @DX_COND_doc_TRUE@@DX_COND_html_TRUE@DX_CLEAN_HTML = @DX_DOCDIR@/html @DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHM = @DX_DOCDIR@/chm @DX_COND_chi_TRUE@@DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi @@ -1224,29 +11777,105 @@ @DX_COND_doc_TRUE@ $(DX_CLEAN_LATEX) CLEANFILES = \ - elpa-generated.h \ + elpa_generated.h \ + elpa_generated_c_api.h \ elpa1_test* \ elpa2_test*\ - *.i - -EXTRA_DIST = \ - fdep/fortran_dependencies.pl \ - fdep/fortran_dependencies.mk \ - test/fortran_test_programs/elpa_test_programs_print_headers.X90 \ - src/elpa_reduce_add_vectors.X90 \ - src/elpa_transpose_vectors.X90 \ - src/redist_band.X90 \ - elpa.spec - + elpa2_real* \ + elpa1_real* \ + elpa*.sh \ + test*.sh \ + single_real* \ + single_complex* \ + real* \ + complex* \ + double_instance* \ + *.i \ + python/pyelpa/wrapper.c \ + check_python.sh + + +# python wrapper files +EXTRA_DIST = elpa.spec elpa/elpa.h elpa/elpa_generic.h \ + fdep/fortran_dependencies.mk fdep/fortran_dependencies.pl \ + manual_cpp nvcc_wrap remove_xcompiler \ + src/helpers/fortran_blas_interfaces.F90 \ + src/helpers/fortran_scalapack_interfaces.F90 \ + src/GPU/cuUtils_template.cu 
src/elpa_api_math_template.F90 \ + src/elpa_impl_math_template.F90 \ + src/elpa_impl_generalized_transform_template.F90 \ + src/elpa1/elpa1_compute_template.F90 \ + src/elpa1/elpa1_merge_systems_real_template.F90 \ + src/elpa1/elpa1_solve_tridi_real_template.F90 \ + src/elpa1/elpa1_template.F90 \ + src/elpa1/elpa1_tools_template.F90 \ + src/elpa1/elpa1_trans_ev_template.F90 \ + src/elpa1/elpa1_tridiag_template.F90 \ + src/elpa1/elpa_cholesky_template.F90 \ + src/elpa1/elpa_invert_trm.F90 src/elpa1/elpa_multiply_a_b.F90 \ + src/elpa1/elpa_reduce_add_vectors.F90 \ + src/elpa1/elpa_solve_tridi_impl_public.F90 \ + src/elpa1/elpa_transpose_vectors.F90 \ + src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \ + src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \ + src/elpa2/compute_hh_trafo.F90 \ + src/elpa2/elpa2_bandred_template.F90 \ + src/elpa2/elpa2_compute_complex_template.F90 \ + src/elpa2/elpa2_compute_real_template.F90 \ + src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 \ + src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \ + src/elpa2/elpa2_template.F90 \ + src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ + src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ + src/elpa2/elpa2_tridiag_band_template.F90 \ + src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \ + src/elpa2/kernels/complex_template.F90 \ + src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \ + src/elpa2/kernels/real_template.F90 \ + src/elpa2/kernels/simple_template.F90 \ + src/elpa2/kernels/simple_block4_template.F90 \ + src/elpa2/kernels/simple_block6_template.F90 \ + src/elpa2/pack_unpack_cpu.F90 src/elpa2/pack_unpack_gpu.F90 \ + src/elpa2/qr/elpa_pdgeqrf_template.F90 \ + src/elpa2/qr/elpa_pdlarfb_template.F90 \ + src/elpa2/qr/elpa_qrkernels_template.F90 \ + src/elpa2/qr/qr_utils_template.F90 src/elpa2/redist_band.F90 \ + src/elpa_generalized/cannon_forw_template.c \ + src/elpa_generalized/cannon_back_template.c src/elpa_index.h \ + 
src/fortran_constants.h src/general/map_global_to_local.F90 \ + src/general/precision_macros.h \ + src/general/precision_typedefs.h \ + src/general/precision_kinds.F90 \ + test/shared/test_precision_kinds.F90 src/general/prow_pcol.F90 \ + src/general/sanity.F90 src/general/elpa_ssr2_template.F90 \ + src/general/elpa_ssmv_template.F90 test/Fortran/assert.h \ + test/Fortran/elpa_print_headers.F90 \ + test/shared/test_check_correctness_template.F90 \ + test/shared/test_prepare_matrix_template.F90 \ + test/shared/test_analytic_template.F90 \ + test_project_1stage/Makefile.am test_project_1stage/autogen.sh \ + test_project_1stage/configure.ac test_project_1stage/fdep \ + test_project_1stage/m4 test_project_1stage/src/test_real.F90 \ + test_project_2stage/Makefile.am test_project_2stage/autogen.sh \ + test_project_2stage/configure.ac test_project_2stage/fdep \ + test_project_2stage/m4 test_project_2stage/src/test_real2.F90 \ + test_project_C/Makefile.am test_project_C/autogen.sh \ + test_project_C/configure.ac test_project_C/fdep \ + test_project_C/m4 test_project_C/src/test_real.c \ + test_project_C/src/test_blacs_infrastructure.F90 \ + $(am__append_830) python/pyelpa/__init__.py \ + python/pyelpa/distributedmatrix.py python/pyelpa/wrapper.pyx \ + python/tests/test_elpa_import.py python/tests/test_mpi4py.py \ + python/tests/test_numroc.py python/tests/test_with_mpi.py LIBTOOL_DEPS = @LIBTOOL_DEPS@ all: $(BUILT_SOURCES) config.h $(MAKE) $(AM_MAKEFLAGS) all-am .SUFFIXES: -.SUFFIXES: .F90 .c .f90 .lo .log .o .obj .s .test .test$(EXEEXT) .trs +.SUFFIXES: .F90 .c .cu .f90 .lo .log .o .obj .s .test .test$(EXEEXT) .trs am--refresh: Makefile @: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/generated_headers.am $(srcdir)/doxygen.am $(am__configure_deps) +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(srcdir)/generated_headers.am $(srcdir)/test_programs.am $(srcdir)/doxygen.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' 
in \ *$$dep*) \ @@ -1265,17 +11894,17 @@ echo ' $(SHELL) ./config.status'; \ $(SHELL) ./config.status;; \ *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__maybe_remake_depfiles);; \ esac; -$(srcdir)/generated_headers.am $(srcdir)/doxygen.am $(am__empty): +$(srcdir)/generated_headers.am $(srcdir)/test_programs.am $(srcdir)/doxygen.am $(am__empty): $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) $(SHELL) ./config.status --recheck -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) +$(top_srcdir)/configure: $(am__configure_deps) $(am__cd) $(srcdir) && $(AUTOCONF) -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) +$(ACLOCAL_M4): $(am__aclocal_m4_deps) $(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) $(am__aclocal_m4_deps): @@ -1286,7 +11915,7 @@ stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status @rm -f stamp-h1 cd $(top_builddir) && $(SHELL) ./config.status config.h -$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) +$(srcdir)/config.h.in: $(am__configure_deps) ($(am__cd) $(top_srcdir) && $(AUTOHEADER)) rm -f stamp-h1 touch $@ @@ -1297,6 +11926,70 @@ cd $(top_builddir) && $(SHELL) ./config.status $@ ${PKG_CONFIG_FILE}: $(top_builddir)/config.status $(srcdir)/elpa.pc.in cd $(top_builddir) && $(SHELL) ./config.status $@ +elpa/elpa_constants.h: $(top_builddir)/config.status $(top_srcdir)/elpa/elpa_constants.h.in + cd $(top_builddir) && $(SHELL) ./config.status $@ +elpa/elpa_version.h: $(top_builddir)/config.status $(top_srcdir)/elpa/elpa_version.h.in + cd $(top_builddir) && $(SHELL) ./config.status $@ +elpa/elpa_build_config.h: $(top_builddir)/config.status $(top_srcdir)/elpa/elpa_build_config.h.in + cd 
$(top_builddir) && $(SHELL) ./config.status $@ +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ + fi; \ + for p in $$list; do echo "$$p $$p"; done | \ + sed 's/$(EXEEXT)$$//' | \ + while read p p1; do if test -f $$p \ + || test -f $$p1 \ + ; then echo "$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n;h' \ + -e 's|.*|.|' \ + -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ + sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) files[d] = files[d] " " $$1; \ + else { print "f", $$3 "/" $$4, $$1; } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + files=`for p in $$list; do echo "$$p"; done | \ + sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ + -e 's/$$/$(EXEEXT)/' \ + `; \ + test -n "$$list" || exit 0; \ + echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ + cd "$(DESTDIR)$(bindir)" && rm -f $$files + +clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ + echo " rm -f" $$list; \ + rm -f $$list || exit $$?; \ + test -n "$(EXEEXT)" || exit 0; \ + list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f" $$list; \ + rm -f $$list + 
+clean-noinstPROGRAMS: + @list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \ + echo " rm -f" $$list; \ + rm -f $$list || exit $$?; \ + test -n "$(EXEEXT)" || exit 0; \ + list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f" $$list; \ + rm -f $$list install-libLTLIBRARIES: $(lib_LTLIBRARIES) @$(NORMAL_INSTALL) @@ -1332,76 +12025,214 @@ echo rm -f $${locs}; \ rm -f $${locs}; \ } + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +install-pyelpaLTLIBRARIES: $(pyelpa_LTLIBRARIES) + @$(NORMAL_INSTALL) + @list='$(pyelpa_LTLIBRARIES)'; test -n "$(pyelpadir)" || list=; \ + list2=; for p in $$list; do \ + if test -f $$p; then \ + list2="$$list2 $$p"; \ + else :; fi; \ + done; \ + test -z "$$list2" || { \ + echo " $(MKDIR_P) '$(DESTDIR)$(pyelpadir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(pyelpadir)" || exit 1; \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(pyelpadir)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(pyelpadir)"; \ + } + +uninstall-pyelpaLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pyelpa_LTLIBRARIES)'; test -n "$(pyelpadir)" || list=; \ + for p in $$list; do \ + $(am__strip_dir) \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(pyelpadir)/$$f'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(pyelpadir)/$$f"; \ + done + +clean-pyelpaLTLIBRARIES: + -test -z "$(pyelpa_LTLIBRARIES)" || rm -f $(pyelpa_LTLIBRARIES) + @list='$(pyelpa_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; 
s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libelpa@SUFFIX@.la: $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_DEPENDENCIES) $(EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES) + $(AM_V_GEN)$(libelpa@SUFFIX@_la_LINK) -rpath $(libdir) $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_LIBADD) $(LIBS) src/$(am__dirstamp): @$(MKDIR_P) src @: > src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) src/$(DEPDIR) @: > src/$(DEPDIR)/$(am__dirstamp) -src/mod_precision.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/mod_mpi.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) -src/mod_mpi_stubs.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/$(am__dirstamp): - @$(MKDIR_P) src/elpa2_kernels - @: > src/elpa2_kernels/$(am__dirstamp) -src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) src/elpa2_kernels/$(DEPDIR) - @: > src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/mod_fortran_interfaces.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa_utilities.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/elpa1_compute.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/elpa1.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) -src/elpa2_utilities.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/mod_pack_unpack_real.lo: src/$(am__dirstamp) \ +src/libelpa@SUFFIX@_private_la-elpa_impl.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/mod_single_hh_trafo_real.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/mod_compute_hh_trafo_real.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/mod_compute_hh_trafo_complex.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/mod_pack_unpack_complex.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/aligned_mem.lo: 
src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) -src/elpa2_compute.lo: src/$(am__dirstamp) \ - src/$(DEPDIR)/$(am__dirstamp) -src/elpa2.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) +src/libelpa@SUFFIX@_private_la-elpa_autotune_impl.lo: \ + src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) +src/libelpa@SUFFIX@_private_la-elpa_abstract_impl.lo: \ + src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) +src/helpers/$(am__dirstamp): + @$(MKDIR_P) src/helpers + @: > src/helpers/$(am__dirstamp) +src/helpers/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/helpers/$(DEPDIR) + @: > src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_precision.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_blas_interfaces.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_scalapack_interfaces.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_mpi.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_mpi_stubs.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_omp.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/libelpa@SUFFIX@_private_la-elpa_generated_fortran_interfaces.lo: \ + src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) +src/elpa2/$(am__dirstamp): + @$(MKDIR_P) src/elpa2 + @: > src/elpa2/$(am__dirstamp) +src/elpa2/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/$(DEPDIR) + @: > src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-mod_redist_band.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_cpu.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) 
+src/elpa2/libelpa@SUFFIX@_private_la-mod_compute_hh_trafo.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-aligned_mem.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/elpa1/$(am__dirstamp): + @$(MKDIR_P) src/elpa1 + @: > src/elpa1/$(am__dirstamp) +src/elpa1/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa1/$(DEPDIR) + @: > src/elpa1/$(DEPDIR)/$(am__dirstamp) +src/elpa1/libelpa@SUFFIX@_private_la-elpa1_compute_private.lo: \ + src/elpa1/$(am__dirstamp) src/elpa1/$(DEPDIR)/$(am__dirstamp) +src/elpa1/libelpa@SUFFIX@_private_la-elpa1_auxiliary.lo: \ + src/elpa1/$(am__dirstamp) src/elpa1/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-elpa2_determine_workload.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-elpa2_compute.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/kernels + @: > src/elpa2/kernels/$(am__dirstamp) +src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/kernels/$(DEPDIR) + @: > src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-mod_single_hh_trafo_real.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/GPU/$(am__dirstamp): + @$(MKDIR_P) src/GPU + @: > src/GPU/$(am__dirstamp) +src/GPU/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/GPU/$(DEPDIR) + @: > src/GPU/$(DEPDIR)/$(am__dirstamp) +src/GPU/libelpa@SUFFIX@_private_la-check_for_gpu.lo: \ + src/GPU/$(am__dirstamp) src/GPU/$(DEPDIR)/$(am__dirstamp) +src/GPU/libelpa@SUFFIX@_private_la-mod_cuda.lo: \ + src/GPU/$(am__dirstamp) src/GPU/$(DEPDIR)/$(am__dirstamp) +src/elpa2/GPU/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/GPU + @: > src/elpa2/GPU/$(am__dirstamp) +src/elpa2/GPU/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/GPU/$(DEPDIR) + @: > 
src/elpa2/GPU/$(DEPDIR)/$(am__dirstamp) +src/elpa2/GPU/libelpa@SUFFIX@_private_la-interface_c_kernel.lo: \ + src/elpa2/GPU/$(am__dirstamp) \ + src/elpa2/GPU/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_gpu.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa2/qr/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/qr + @: > src/elpa2/qr/$(am__dirstamp) +src/elpa2/qr/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa2/qr/$(DEPDIR) + @: > src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) +src/elpa2/qr/libelpa@SUFFIX@_private_la-qr_utils.lo: \ + src/elpa2/qr/$(am__dirstamp) \ + src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_qrkernels.lo: \ + src/elpa2/qr/$(am__dirstamp) \ + src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdlarfb.lo: \ + src/elpa2/qr/$(am__dirstamp) \ + src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdgeqrf.lo: \ + src/elpa2/qr/$(am__dirstamp) \ + src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) +src/elpa1/libelpa@SUFFIX@_private_la-elpa1.lo: \ + src/elpa1/$(am__dirstamp) src/elpa1/$(DEPDIR)/$(am__dirstamp) +src/elpa2/libelpa@SUFFIX@_private_la-elpa2.lo: \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) +src/elpa_generalized/$(am__dirstamp): + @$(MKDIR_P) src/elpa_generalized + @: > src/elpa_generalized/$(am__dirstamp) +src/elpa_generalized/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/elpa_generalized/$(DEPDIR) + @: > src/elpa_generalized/$(DEPDIR)/$(am__dirstamp) +src/elpa_generalized/cannon.lo: src/elpa_generalized/$(am__dirstamp) \ + src/elpa_generalized/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-matrix_plot.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/general/$(am__dirstamp): + @$(MKDIR_P) src/general + @: > src/general/$(am__dirstamp) +src/general/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) src/general/$(DEPDIR) + @: > 
src/general/$(DEPDIR)/$(am__dirstamp) +src/general/libelpa@SUFFIX@_private_la-mod_elpa_skewsymmetric_blas.lo: \ + src/general/$(am__dirstamp) \ + src/general/$(DEPDIR)/$(am__dirstamp) +src/elpa_index.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) src/elpa_c_interface.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) -src/elpa_qr/$(am__dirstamp): - @$(MKDIR_P) src/elpa_qr - @: > src/elpa_qr/$(am__dirstamp) -src/elpa_qr/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) src/elpa_qr/$(DEPDIR) - @: > src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -src/elpa_qr/qr_utils.lo: src/elpa_qr/$(am__dirstamp) \ - src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -src/elpa_qr/elpa_qrkernels.lo: src/elpa_qr/$(am__dirstamp) \ - src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -src/elpa_qr/elpa_pdlarfb.lo: src/elpa_qr/$(am__dirstamp) \ - src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -src/elpa_qr/elpa_pdgeqrf.lo: src/elpa_qr/$(am__dirstamp) \ - src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -src/timer.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) +src/general/libelpa@SUFFIX@_private_la-elpa_utilities.lo: \ + src/general/$(am__dirstamp) \ + src/general/$(DEPDIR)/$(am__dirstamp) src/ftimings/$(am__dirstamp): @$(MKDIR_P) src/ftimings @: > src/ftimings/$(am__dirstamp) src/ftimings/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) src/ftimings/$(DEPDIR) @: > src/ftimings/$(DEPDIR)/$(am__dirstamp) -src/ftimings/ftimings.lo: src/ftimings/$(am__dirstamp) \ +src/ftimings/libelpa@SUFFIX@_private_la-ftimings.lo: \ + src/ftimings/$(am__dirstamp) \ src/ftimings/$(DEPDIR)/$(am__dirstamp) -src/ftimings/ftimings_type.lo: src/ftimings/$(am__dirstamp) \ +src/ftimings/libelpa@SUFFIX@_private_la-ftimings_type.lo: \ + src/ftimings/$(am__dirstamp) \ src/ftimings/$(DEPDIR)/$(am__dirstamp) -src/ftimings/ftimings_value.lo: src/ftimings/$(am__dirstamp) \ +src/ftimings/libelpa@SUFFIX@_private_la-ftimings_value.lo: \ + src/ftimings/$(am__dirstamp) \ src/ftimings/$(DEPDIR)/$(am__dirstamp) src/ftimings/highwater_mark.lo: src/ftimings/$(am__dirstamp) \ 
src/ftimings/$(DEPDIR)/$(am__dirstamp) @@ -1413,313 +12244,2739 @@ src/ftimings/$(DEPDIR)/$(am__dirstamp) src/ftimings/papi.lo: src/ftimings/$(am__dirstamp) \ src/ftimings/$(DEPDIR)/$(am__dirstamp) -src/mod_time_c.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_simple.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex_simple.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_bgp.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_bgq.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_sse_2hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_sse_4hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_sse_6hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - 
src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.lo: \ - src/elpa2_kernels/$(am__dirstamp) \ - src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) - -libelpa@SUFFIX@.la: $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_DEPENDENCIES) $(EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES) - $(AM_V_GEN)$(libelpa@SUFFIX@_la_LINK) -rpath $(libdir) $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_LIBADD) $(LIBS) -install-binPROGRAMS: $(bin_PROGRAMS) - @$(NORMAL_INSTALL) - @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ - fi; \ - for p in $$list; do echo "$$p $$p"; done | \ - sed 's/$(EXEEXT)$$//' | \ - while read p p1; do if test -f $$p \ - || test -f $$p1 \ - ; then echo "$$p"; echo "$$p"; else :; fi; \ - done | \ - sed -e 'p;s,.*/,,;n;h' \ - -e 's|.*|.|' \ - -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ - sed 'N;N;N;s,\n, ,g' | \ - $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ - { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ - if ($$2 == $$4) files[d] = files[d] " " $$1; \ - else { print "f", $$3 "/" $$4, $$1; } } \ - END { for (d in files) print "f", d, files[d] }' | \ - while read type dir files; do \ - if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ - test -z "$$files" || { \ - echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ - $(INSTALL_PROGRAM_ENV) 
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ - } \ - ; done - -uninstall-binPROGRAMS: - @$(NORMAL_UNINSTALL) - @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ - files=`for p in $$list; do echo "$$p"; done | \ - sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ - -e 's/$$/$(EXEEXT)/' \ - `; \ - test -n "$$list" || exit 0; \ - echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ - cd "$(DESTDIR)$(bindir)" && rm -f $$files - -clean-binPROGRAMS: - @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ - echo " rm -f" $$list; \ - rm -f $$list || exit $$?; \ - test -n "$(EXEEXT)" || exit 0; \ - list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ - echo " rm -f" $$list; \ - rm -f $$list - -clean-noinstPROGRAMS: - @list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \ - echo " rm -f" $$list; \ - rm -f $$list || exit $$?; \ - test -n "$(EXEEXT)" || exit 0; \ - list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ - echo " rm -f" $$list; \ - rm -f $$list -test/fortran_test_programs/$(am__dirstamp): - @$(MKDIR_P) test/fortran_test_programs - @: > test/fortran_test_programs/$(am__dirstamp) -test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) test/fortran_test_programs/$(DEPDIR) - @: > test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) -test/fortran_test_programs/test_complex.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/$(am__dirstamp): - @$(MKDIR_P) test/shared_sources - @: > test/shared_sources/$(am__dirstamp) -test/shared_sources/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) test/shared_sources/$(DEPDIR) - @: > test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/util.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/read_input_parameters.$(OBJEXT): \ 
- test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/check_correctnes.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/setup_mpi.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/blacs_infrastructure.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/prepare_matrix.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/mod_output_types.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/redir.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/redirect.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) - -elpa1_test_complex@SUFFIX@$(EXEEXT): $(elpa1_test_complex@SUFFIX@_OBJECTS) $(elpa1_test_complex@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES) - @rm -f elpa1_test_complex@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa1_test_complex@SUFFIX@_OBJECTS) $(elpa1_test_complex@SUFFIX@_LDADD) $(LIBS) -test/c_test_programs/$(am__dirstamp): - @$(MKDIR_P) test/c_test_programs - @: > test/c_test_programs/$(am__dirstamp) -test/c_test_programs/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) test/c_test_programs/$(DEPDIR) - @: > test/c_test_programs/$(DEPDIR)/$(am__dirstamp) -test/c_test_programs/elpa1_test_complex_c_version.$(OBJEXT): \ - test/c_test_programs/$(am__dirstamp) \ - test/c_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa1_test_complex_c_version@SUFFIX@$(EXEEXT): $(elpa1_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES) - @rm -f 
elpa1_test_complex_c_version@SUFFIX@$(EXEEXT) - $(AM_V_GEN)$(elpa1_test_complex_c_version@SUFFIX@_LINK) $(elpa1_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa1_test_complex_c_version@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa1_test_real@SUFFIX@$(EXEEXT): $(elpa1_test_real@SUFFIX@_OBJECTS) $(elpa1_test_real@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES) - @rm -f elpa1_test_real@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa1_test_real@SUFFIX@_OBJECTS) $(elpa1_test_real@SUFFIX@_LDADD) $(LIBS) -test/c_test_programs/elpa1_test_real_c_version.$(OBJEXT): \ - test/c_test_programs/$(am__dirstamp) \ - test/c_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa1_test_real_c_version@SUFFIX@$(EXEEXT): $(elpa1_test_real_c_version@SUFFIX@_OBJECTS) $(elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES) - @rm -f elpa1_test_real_c_version@SUFFIX@$(EXEEXT) - $(AM_V_GEN)$(elpa1_test_real_c_version@SUFFIX@_LINK) $(elpa1_test_real_c_version@SUFFIX@_OBJECTS) $(elpa1_test_real_c_version@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real_with_c.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/mod_from_c.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) -test/shared_sources/call_elpa1.$(OBJEXT): \ - test/shared_sources/$(am__dirstamp) \ - test/shared_sources/$(DEPDIR)/$(am__dirstamp) - -elpa1_test_real_with_c@SUFFIX@$(EXEEXT): $(elpa1_test_real_with_c@SUFFIX@_OBJECTS) $(elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES) - @rm -f elpa1_test_real_with_c@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa1_test_real_with_c@SUFFIX@_OBJECTS) $(elpa1_test_real_with_c@SUFFIX@_LDADD) $(LIBS) 
-src/elpa2_print_kernels.$(OBJEXT): src/$(am__dirstamp) \ +src/helpers/libelpa@SUFFIX@_private_la-timer_dummy.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/GPU/cudaFunctions.lo: src/GPU/$(am__dirstamp) \ + src/GPU/$(DEPDIR)/$(am__dirstamp) +src/GPU/cuUtils.lo: src/GPU/$(am__dirstamp) \ + src/GPU/$(DEPDIR)/$(am__dirstamp) +src/elpa2/GPU/ev_tridi_band_gpu_c_v2.lo: \ + src/elpa2/GPU/$(am__dirstamp) \ + src/elpa2/GPU/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_time_c.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/get_cpuid_set.lo: src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/helpers/libelpa@SUFFIX@_private_la-mod_simd_kernel.lo: \ + src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex_simple.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block4.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block6.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgp.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgq.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + 
src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/asm_x86_64_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/asm_x86_64_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sparc64_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sparc64_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + 
src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_4hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_4hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_4hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_4hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_4hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_4hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sparc64_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_neon_arch64_6hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + 
src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_vsx_6hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_sse_6hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx-avx2_6hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_6hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/real_avx512_6hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_sse_1hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_sse_1hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx512_1hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx512_1hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_sse_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + 
src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_sse_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx512_2hv_double_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/elpa2/kernels/complex_avx512_2hv_single_precision.lo: \ + src/elpa2/kernels/$(am__dirstamp) \ + src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) +src/helpers/print_build_config.lo: src/helpers/$(am__dirstamp) \ + src/helpers/$(DEPDIR)/$(am__dirstamp) + +libelpa@SUFFIX@_private.la: $(libelpa@SUFFIX@_private_la_OBJECTS) $(libelpa@SUFFIX@_private_la_DEPENDENCIES) $(EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES) + $(AM_V_FCLD)$(libelpa@SUFFIX@_private_la_LINK) $(libelpa@SUFFIX@_private_la_OBJECTS) $(libelpa@SUFFIX@_private_la_LIBADD) $(LIBS) +src/libelpa@SUFFIX@_public_la-elpa.lo: src/$(am__dirstamp) \ + src/$(DEPDIR)/$(am__dirstamp) +src/libelpa@SUFFIX@_public_la-elpa_api.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) +src/libelpa@SUFFIX@_public_la-elpa_constants.lo: src/$(am__dirstamp) \ + src/$(DEPDIR)/$(am__dirstamp) + +libelpa@SUFFIX@_public.la: $(libelpa@SUFFIX@_public_la_OBJECTS) $(libelpa@SUFFIX@_public_la_DEPENDENCIES) $(EXTRA_libelpa@SUFFIX@_public_la_DEPENDENCIES) + $(AM_V_FCLD)$(libelpa@SUFFIX@_public_la_LINK) $(libelpa@SUFFIX@_public_la_OBJECTS) $(libelpa@SUFFIX@_public_la_LIBADD) $(LIBS) +test/shared/$(am__dirstamp): + @$(MKDIR_P) test/shared + @: > test/shared/$(am__dirstamp) +test/shared/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) test/shared/$(DEPDIR) + @: > test/shared/$(DEPDIR)/$(am__dirstamp) 
+test/shared/libelpatest@SUFFIX@_la-tests_variable_definitions.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-mod_tests_scalapack_interfaces.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-mod_tests_blas_interfaces.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_util.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_read_input_parameters.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_check_correctness.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_setup_mpi.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_blacs_infrastructure.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_prepare_matrix.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_analytic.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_output_type.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_scalapack.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/test_redir.lo: test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) +test/shared/libelpatest@SUFFIX@_la-test_redirect.lo: \ + test/shared/$(am__dirstamp) \ + test/shared/$(DEPDIR)/$(am__dirstamp) + +libelpatest@SUFFIX@.la: $(libelpatest@SUFFIX@_la_OBJECTS) $(libelpatest@SUFFIX@_la_DEPENDENCIES) $(EXTRA_libelpatest@SUFFIX@_la_DEPENDENCIES) + 
$(AM_V_FCLD)$(libelpatest@SUFFIX@_la_LINK) $(libelpatest@SUFFIX@_la_OBJECTS) $(libelpatest@SUFFIX@_la_LIBADD) $(LIBS) +python/pyelpa/$(am__dirstamp): + @$(MKDIR_P) python/pyelpa + @: > python/pyelpa/$(am__dirstamp) +python/pyelpa/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) python/pyelpa/$(DEPDIR) + @: > python/pyelpa/$(DEPDIR)/$(am__dirstamp) +python/pyelpa/wrapper_la-wrapper.lo: python/pyelpa/$(am__dirstamp) \ + python/pyelpa/$(DEPDIR)/$(am__dirstamp) + +wrapper.la: $(wrapper_la_OBJECTS) $(wrapper_la_DEPENDENCIES) $(EXTRA_wrapper_la_DEPENDENCIES) + $(AM_V_CCLD)$(wrapper_la_LINK) $(am_wrapper_la_rpath) $(wrapper_la_OBJECTS) $(wrapper_la_LIBADD) $(LIBS) +src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.$(OBJEXT): \ + src/elpa2/$(am__dirstamp) src/elpa2/$(DEPDIR)/$(am__dirstamp) elpa2_print_kernels@SUFFIX@$(EXEEXT): $(elpa2_print_kernels@SUFFIX@_OBJECTS) $(elpa2_print_kernels@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_print_kernels@SUFFIX@_DEPENDENCIES) @rm -f elpa2_print_kernels@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_print_kernels@SUFFIX@_OBJECTS) $(elpa2_print_kernels@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_complex2.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_complex@SUFFIX@$(EXEEXT): $(elpa2_test_complex@SUFFIX@_OBJECTS) $(elpa2_test_complex@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_complex@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex@SUFFIX@_OBJECTS) $(elpa2_test_complex@SUFFIX@_LDADD) $(LIBS) -test/c_test_programs/elpa2_test_complex_c_version.$(OBJEXT): \ - test/c_test_programs/$(am__dirstamp) \ - test/c_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_complex_c_version@SUFFIX@$(EXEEXT): $(elpa2_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES) - @rm -f 
elpa2_test_complex_c_version@SUFFIX@$(EXEEXT) - $(AM_V_GEN)$(elpa2_test_complex_c_version@SUFFIX@_LINK) $(elpa2_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa2_test_complex_c_version@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_complex2_choose_kernel_with_api.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT): $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_complex2_default_kernel.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT): $(elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_complex_default_kernel@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real2.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_real@SUFFIX@$(EXEEXT): $(elpa2_test_real@SUFFIX@_OBJECTS) $(elpa2_test_real@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_real@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real@SUFFIX@_OBJECTS) $(elpa2_test_real@SUFFIX@_LDADD) $(LIBS) -test/c_test_programs/elpa2_test_real_c_version.$(OBJEXT): \ - 
test/c_test_programs/$(am__dirstamp) \ - test/c_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_real_c_version@SUFFIX@$(EXEEXT): $(elpa2_test_real_c_version@SUFFIX@_OBJECTS) $(elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_real_c_version@SUFFIX@$(EXEEXT) - $(AM_V_GEN)$(elpa2_test_real_c_version@SUFFIX@_LINK) $(elpa2_test_real_c_version@SUFFIX@_OBJECTS) $(elpa2_test_real_c_version@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real2_choose_kernel_with_api.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT): $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real2_default_kernel.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT): $(elpa2_test_real_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel@SUFFIX@_LDADD) $(LIBS) -test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.$(OBJEXT): \ - test/fortran_test_programs/$(am__dirstamp) \ - test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT): 
$(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES) - @rm -f elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT) - $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD) $(LIBS) + $(AM_V_FCLD)$(elpa2_print_kernels@SUFFIX@_LINK) $(elpa2_print_kernels@SUFFIX@_OBJECTS) $(elpa2_print_kernels@SUFFIX@_LDADD) $(LIBS) +test/Fortran/$(am__dirstamp): + @$(MKDIR_P) test/Fortran + @: > test/Fortran/$(am__dirstamp) +test/Fortran/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) test/Fortran/$(DEPDIR) + @: > test/Fortran/$(DEPDIR)/$(am__dirstamp) +test/Fortran/skewsymmetric_real_double-test_skewsymmetric.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +test_skewsymmetric_real_double$(EXEEXT): $(test_skewsymmetric_real_double_OBJECTS) $(test_skewsymmetric_real_double_DEPENDENCIES) $(EXTRA_test_skewsymmetric_real_double_DEPENDENCIES) + @rm -f test_skewsymmetric_real_double$(EXEEXT) + $(AM_V_FCLD)$(test_skewsymmetric_real_double_LINK) $(test_skewsymmetric_real_double_OBJECTS) $(test_skewsymmetric_real_double_LDADD) $(LIBS) +test/Fortran/skewsymmetric_real_single-test_skewsymmetric.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +test_skewsymmetric_real_single$(EXEEXT): $(test_skewsymmetric_real_single_OBJECTS) $(test_skewsymmetric_real_single_DEPENDENCIES) $(EXTRA_test_skewsymmetric_real_single_DEPENDENCIES) + @rm -f test_skewsymmetric_real_single$(EXEEXT) + $(AM_V_FCLD)$(test_skewsymmetric_real_single_LINK) $(test_skewsymmetric_real_single_OBJECTS) $(test_skewsymmetric_real_single_LDADD) $(LIBS) +test/C/$(am__dirstamp): + @$(MKDIR_P) test/C + @: > test/C/$(am__dirstamp) +test/C/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) test/C/$(DEPDIR) + 
@: > test/C/$(DEPDIR)/$(am__dirstamp) +test/C/validate_autotune_c_version_complex_double-test_autotune.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_c_version_complex_double$(EXEEXT): $(validate_autotune_c_version_complex_double_OBJECTS) $(validate_autotune_c_version_complex_double_DEPENDENCIES) $(EXTRA_validate_autotune_c_version_complex_double_DEPENDENCIES) + @rm -f validate_autotune_c_version_complex_double$(EXEEXT) + $(AM_V_CCLD)$(validate_autotune_c_version_complex_double_LINK) $(validate_autotune_c_version_complex_double_OBJECTS) $(validate_autotune_c_version_complex_double_LDADD) $(LIBS) +test/C/validate_autotune_c_version_complex_single-test_autotune.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_c_version_complex_single$(EXEEXT): $(validate_autotune_c_version_complex_single_OBJECTS) $(validate_autotune_c_version_complex_single_DEPENDENCIES) $(EXTRA_validate_autotune_c_version_complex_single_DEPENDENCIES) + @rm -f validate_autotune_c_version_complex_single$(EXEEXT) + $(AM_V_CCLD)$(validate_autotune_c_version_complex_single_LINK) $(validate_autotune_c_version_complex_single_OBJECTS) $(validate_autotune_c_version_complex_single_LDADD) $(LIBS) +test/C/validate_autotune_c_version_real_double-test_autotune.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_c_version_real_double$(EXEEXT): $(validate_autotune_c_version_real_double_OBJECTS) $(validate_autotune_c_version_real_double_DEPENDENCIES) $(EXTRA_validate_autotune_c_version_real_double_DEPENDENCIES) + @rm -f validate_autotune_c_version_real_double$(EXEEXT) + $(AM_V_CCLD)$(validate_autotune_c_version_real_double_LINK) $(validate_autotune_c_version_real_double_OBJECTS) $(validate_autotune_c_version_real_double_LDADD) $(LIBS) +test/C/validate_autotune_c_version_real_single-test_autotune.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + 
+validate_autotune_c_version_real_single$(EXEEXT): $(validate_autotune_c_version_real_single_OBJECTS) $(validate_autotune_c_version_real_single_DEPENDENCIES) $(EXTRA_validate_autotune_c_version_real_single_DEPENDENCIES) + @rm -f validate_autotune_c_version_real_single$(EXEEXT) + $(AM_V_CCLD)$(validate_autotune_c_version_real_single_LINK) $(validate_autotune_c_version_real_single_OBJECTS) $(validate_autotune_c_version_real_single_LDADD) $(LIBS) +test/Fortran/validate_autotune_complex_double-test_autotune.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_complex_double$(EXEEXT): $(validate_autotune_complex_double_OBJECTS) $(validate_autotune_complex_double_DEPENDENCIES) $(EXTRA_validate_autotune_complex_double_DEPENDENCIES) + @rm -f validate_autotune_complex_double$(EXEEXT) + $(AM_V_FCLD)$(validate_autotune_complex_double_LINK) $(validate_autotune_complex_double_OBJECTS) $(validate_autotune_complex_double_LDADD) $(LIBS) +test/Fortran/validate_autotune_complex_single-test_autotune.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_complex_single$(EXEEXT): $(validate_autotune_complex_single_OBJECTS) $(validate_autotune_complex_single_DEPENDENCIES) $(EXTRA_validate_autotune_complex_single_DEPENDENCIES) + @rm -f validate_autotune_complex_single$(EXEEXT) + $(AM_V_FCLD)$(validate_autotune_complex_single_LINK) $(validate_autotune_complex_single_OBJECTS) $(validate_autotune_complex_single_LDADD) $(LIBS) +test/Fortran/validate_autotune_real_double-test_autotune.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_real_double$(EXEEXT): $(validate_autotune_real_double_OBJECTS) $(validate_autotune_real_double_DEPENDENCIES) $(EXTRA_validate_autotune_real_double_DEPENDENCIES) + @rm -f validate_autotune_real_double$(EXEEXT) + $(AM_V_FCLD)$(validate_autotune_real_double_LINK) $(validate_autotune_real_double_OBJECTS) 
$(validate_autotune_real_double_LDADD) $(LIBS) +test/Fortran/validate_autotune_real_single-test_autotune.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_autotune_real_single$(EXEEXT): $(validate_autotune_real_single_OBJECTS) $(validate_autotune_real_single_DEPENDENCIES) $(EXTRA_validate_autotune_real_single_DEPENDENCIES) + @rm -f validate_autotune_real_single$(EXEEXT) + $(AM_V_FCLD)$(validate_autotune_real_single_LINK) $(validate_autotune_real_single_OBJECTS) $(validate_autotune_real_single_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_LINK) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_eigenvectors_1stage_random$(EXEEXT): $(validate_c_version_complex_double_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_eigenvectors_1stage_random_LINK) 
$(validate_c_version_complex_double_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_LINK) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) 
+test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_generalized_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_double_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_generalized_1stage_gpu_random_LINK) $(validate_c_version_complex_double_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_generalized_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_generalized_1stage_random$(EXEEXT): $(validate_c_version_complex_double_generalized_1stage_random_OBJECTS) $(validate_c_version_complex_double_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_generalized_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_generalized_1stage_random_LINK) $(validate_c_version_complex_double_generalized_1stage_random_OBJECTS) $(validate_c_version_complex_double_generalized_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) 
$(EXTRA_validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_LINK) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_double_generalized_decomp_1stage_random$(EXEEXT): $(validate_c_version_complex_double_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_double_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_complex_double_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_double_generalized_decomp_1stage_random_LINK) $(validate_c_version_complex_double_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_LINK) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) 
$(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_eigenvectors_1stage_random$(EXEEXT): $(validate_c_version_complex_single_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_eigenvectors_1stage_random_LINK) $(validate_c_version_complex_single_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + 
+validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_LINK) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_generalized_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_single_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_generalized_1stage_gpu_random_LINK) $(validate_c_version_complex_single_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_generalized_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_generalized_1stage_random$(EXEEXT): $(validate_c_version_complex_single_generalized_1stage_random_OBJECTS) $(validate_c_version_complex_single_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_generalized_1stage_random_DEPENDENCIES) + @rm -f 
validate_c_version_complex_single_generalized_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_generalized_1stage_random_LINK) $(validate_c_version_complex_single_generalized_1stage_random_OBJECTS) $(validate_c_version_complex_single_generalized_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_LINK) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_complex_single_generalized_decomp_1stage_random$(EXEEXT): $(validate_c_version_complex_single_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_complex_single_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_complex_single_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_complex_single_generalized_decomp_1stage_random_LINK) $(validate_c_version_complex_single_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_LDADD) $(LIBS) 
+test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_eigenvectors_1stage_gpu_random_LINK) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_eigenvectors_1stage_random$(EXEEXT): $(validate_c_version_real_double_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_eigenvectors_1stage_random_LINK) $(validate_c_version_real_double_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) 
$(EXTRA_validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_LINK) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_generalized_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_double_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_generalized_1stage_gpu_random_LINK) 
$(validate_c_version_real_double_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_generalized_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_generalized_1stage_random$(EXEEXT): $(validate_c_version_real_double_generalized_1stage_random_OBJECTS) $(validate_c_version_real_double_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_generalized_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_generalized_1stage_random_LINK) $(validate_c_version_real_double_generalized_1stage_random_OBJECTS) $(validate_c_version_real_double_generalized_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_LINK) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_double_generalized_decomp_1stage_random$(EXEEXT): 
$(validate_c_version_real_double_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_real_double_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_double_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_double_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_double_generalized_decomp_1stage_random_LINK) $(validate_c_version_real_double_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_real_double_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_eigenvectors_1stage_gpu_random_LINK) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_eigenvectors_1stage_random$(EXEEXT): $(validate_c_version_real_single_eigenvectors_1stage_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_eigenvectors_1stage_random_LINK) $(validate_c_version_real_single_eigenvectors_1stage_random_OBJECTS) 
$(validate_c_version_real_single_eigenvectors_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_LINK) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + 
+validate_c_version_real_single_generalized_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_single_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_generalized_1stage_gpu_random_LINK) $(validate_c_version_real_single_generalized_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_generalized_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_generalized_1stage_random$(EXEEXT): $(validate_c_version_real_single_generalized_1stage_random_OBJECTS) $(validate_c_version_real_single_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_generalized_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_generalized_1stage_random_LINK) $(validate_c_version_real_single_generalized_1stage_random_OBJECTS) $(validate_c_version_real_single_generalized_1stage_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) + 
$(AM_V_CCLD)$(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_LINK) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_c_version_real_single_generalized_decomp_1stage_random$(EXEEXT): $(validate_c_version_real_single_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_real_single_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_c_version_real_single_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_c_version_real_single_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_CCLD)$(validate_c_version_real_single_generalized_decomp_1stage_random_LINK) $(validate_c_version_real_single_generalized_decomp_1stage_random_OBJECTS) $(validate_c_version_real_single_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/Fortran/elpa2/$(am__dirstamp): + @$(MKDIR_P) test/Fortran/elpa2 + @: > test/Fortran/elpa2/$(am__dirstamp) +test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) test/Fortran/elpa2/$(DEPDIR) + @: > test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) +test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.$(OBJEXT): \ + test/Fortran/elpa2/$(am__dirstamp) \ + test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + +validate_complex_2stage_banded@SUFFIX@$(EXEEXT): $(validate_complex_2stage_banded@SUFFIX@_OBJECTS) $(validate_complex_2stage_banded@SUFFIX@_DEPENDENCIES) $(EXTRA_validate_complex_2stage_banded@SUFFIX@_DEPENDENCIES) + @rm -f validate_complex_2stage_banded@SUFFIX@$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_2stage_banded@SUFFIX@_LINK) $(validate_complex_2stage_banded@SUFFIX@_OBJECTS) $(validate_complex_2stage_banded@SUFFIX@_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.$(OBJEXT): 
\ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_gpu_random$(EXEEXT): $(validate_complex_double_cholesky_1stage_gpu_random_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_gpu_random_LINK) $(validate_complex_double_cholesky_1stage_gpu_random_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_LINK) $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_double_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_gpu_toeplitz_LINK) 
$(validate_complex_double_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_random$(EXEEXT): $(validate_complex_double_cholesky_1stage_random_OBJECTS) $(validate_complex_double_cholesky_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_random_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_random_LINK) $(validate_complex_double_cholesky_1stage_random_OBJECTS) $(validate_complex_double_cholesky_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_random_all_layouts$(EXEEXT): $(validate_complex_double_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_double_cholesky_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_random_all_layouts_LINK) $(validate_complex_double_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_toeplitz$(EXEEXT): $(validate_complex_double_cholesky_1stage_toeplitz_OBJECTS) $(validate_complex_double_cholesky_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_toeplitz_LINK) $(validate_complex_double_cholesky_1stage_toeplitz_OBJECTS) $(validate_complex_double_cholesky_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_LINK) $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_LINK) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_1stage_toeplitz$(EXEEXT): $(validate_complex_double_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_1stage_toeplitz$(EXEEXT) + 
$(AM_V_FCLD)$(validate_complex_double_eigenvalues_1stage_toeplitz_LINK) $(validate_complex_double_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_LINK) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_analytic$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_analytic_OBJECTS) $(validate_complex_double_eigenvectors_1stage_analytic_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_analytic_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_analytic_LINK) $(validate_complex_double_eigenvectors_1stage_analytic_OBJECTS) $(validate_complex_double_eigenvectors_1stage_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_LINK) $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_gpu_random_LINK) $(validate_complex_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_LINK) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_random$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_random_OBJECTS) $(validate_complex_double_eigenvectors_1stage_random_DEPENDENCIES) 
$(EXTRA_validate_complex_double_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_random_LINK) $(validate_complex_double_eigenvectors_1stage_random_OBJECTS) $(validate_complex_double_eigenvectors_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_toeplitz_LINK) $(validate_complex_double_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK) 
$(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_random$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_random_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT): 
\ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) 
$(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f 
validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + 
$(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_random_LINK) 
$(validate_complex_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_scalapack_all_analytic$(EXEEXT): $(validate_complex_double_eigenvectors_scalapack_all_analytic_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_scalapack_all_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_scalapack_all_analytic_LINK) $(validate_complex_double_eigenvectors_scalapack_all_analytic_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT): 
$(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_LINK) $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_scalapack_part_analytic$(EXEEXT): $(validate_complex_double_eigenvectors_scalapack_part_analytic_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES) + @rm -f validate_complex_double_eigenvectors_scalapack_part_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_scalapack_part_analytic_LINK) $(validate_complex_double_eigenvectors_scalapack_part_analytic_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT): $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES) + @rm -f 
validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_LINK) $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_1stage_gpu_random$(EXEEXT): $(validate_complex_double_generalized_1stage_gpu_random_OBJECTS) $(validate_complex_double_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_1stage_gpu_random_LINK) $(validate_complex_double_generalized_1stage_gpu_random_OBJECTS) $(validate_complex_double_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_1stage_gpu_random_all_layouts_LINK) $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_1stage_random-test.$(OBJEXT): \ + 
test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_1stage_random$(EXEEXT): $(validate_complex_double_generalized_1stage_random_OBJECTS) $(validate_complex_double_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_complex_double_generalized_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_1stage_random_LINK) $(validate_complex_double_generalized_1stage_random_OBJECTS) $(validate_complex_double_generalized_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_1stage_random_all_layouts$(EXEEXT): $(validate_complex_double_generalized_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_generalized_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_1stage_random_all_layouts_LINK) $(validate_complex_double_generalized_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_generalized_decomp_1stage_gpu_random$(EXEEXT) + 
$(AM_V_FCLD)$(validate_complex_double_generalized_decomp_1stage_gpu_random_LINK) $(validate_complex_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_LINK) $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_decomp_1stage_random$(EXEEXT): $(validate_complex_double_generalized_decomp_1stage_random_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_complex_double_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_decomp_1stage_random_LINK) $(validate_complex_double_generalized_decomp_1stage_random_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT): \ + 
test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT): $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_generalized_decomp_1stage_random_all_layouts_LINK) $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_hermitian_multiply_1stage_gpu_random$(EXEEXT): $(validate_complex_double_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_hermitian_multiply_1stage_gpu_random_LINK) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_LINK) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_hermitian_multiply_1stage_random$(EXEEXT): $(validate_complex_double_hermitian_multiply_1stage_random_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_double_hermitian_multiply_1stage_random_DEPENDENCIES) + @rm -f validate_complex_double_hermitian_multiply_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_hermitian_multiply_1stage_random_LINK) $(validate_complex_double_hermitian_multiply_1stage_random_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT): $(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_LINK) $(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) 
$(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_gpu_random$(EXEEXT): $(validate_complex_single_cholesky_1stage_gpu_random_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_gpu_random_LINK) $(validate_complex_single_cholesky_1stage_gpu_random_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_LINK) $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_single_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) 
$(EXTRA_validate_complex_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_gpu_toeplitz_LINK) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_random$(EXEEXT): $(validate_complex_single_cholesky_1stage_random_OBJECTS) $(validate_complex_single_cholesky_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_random_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_random_LINK) $(validate_complex_single_cholesky_1stage_random_OBJECTS) $(validate_complex_single_cholesky_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) 
+ +validate_complex_single_cholesky_1stage_random_all_layouts$(EXEEXT): $(validate_complex_single_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_random_all_layouts_LINK) $(validate_complex_single_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_toeplitz$(EXEEXT): $(validate_complex_single_cholesky_1stage_toeplitz_OBJECTS) $(validate_complex_single_cholesky_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_toeplitz_LINK) $(validate_complex_single_cholesky_1stage_toeplitz_OBJECTS) $(validate_complex_single_cholesky_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_LINK) $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) 
$(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_LINK) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_1stage_toeplitz$(EXEEXT): $(validate_complex_single_eigenvalues_1stage_toeplitz_OBJECTS) 
$(validate_complex_single_eigenvalues_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_1stage_toeplitz_LINK) $(validate_complex_single_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK) 
$(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_LINK) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_analytic$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_analytic_OBJECTS) $(validate_complex_single_eigenvectors_1stage_analytic_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_analytic_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_analytic_LINK) $(validate_complex_single_eigenvectors_1stage_analytic_OBJECTS) $(validate_complex_single_eigenvectors_1stage_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) 
$(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_LINK) $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_gpu_random_LINK) $(validate_complex_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_LINK) $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) 
$(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_LINK) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_random$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_random_OBJECTS) 
$(validate_complex_single_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_random_LINK) $(validate_complex_single_eigenvectors_1stage_random_OBJECTS) $(validate_complex_single_eigenvectors_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_random_all_layouts_LINK) $(validate_complex_single_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_toeplitz$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_toeplitz_LINK) $(validate_complex_single_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_1stage_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT): 
$(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f 
validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_random$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_random_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_LINK) 
$(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + 
+validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT): 
$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) 
$(validate_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_random_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f 
validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_1stage_gpu_random$(EXEEXT): $(validate_complex_single_generalized_1stage_gpu_random_OBJECTS) $(validate_complex_single_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_1stage_gpu_random_LINK) $(validate_complex_single_generalized_1stage_gpu_random_OBJECTS) 
$(validate_complex_single_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_1stage_gpu_random_all_layouts_LINK) $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_1stage_random$(EXEEXT): $(validate_complex_single_generalized_1stage_random_OBJECTS) $(validate_complex_single_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_complex_single_generalized_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_1stage_random_LINK) $(validate_complex_single_generalized_1stage_random_OBJECTS) $(validate_complex_single_generalized_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_1stage_random_all_layouts$(EXEEXT): $(validate_complex_single_generalized_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_1stage_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_complex_single_generalized_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_generalized_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_1stage_random_all_layouts_LINK) $(validate_complex_single_generalized_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_decomp_1stage_gpu_random_LINK) $(validate_complex_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_LINK) $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) 
$(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_decomp_1stage_random$(EXEEXT): $(validate_complex_single_generalized_decomp_1stage_random_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_complex_single_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_decomp_1stage_random_LINK) $(validate_complex_single_generalized_decomp_1stage_random_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT): $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_generalized_decomp_1stage_random_all_layouts_LINK) $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_hermitian_multiply_1stage_gpu_random$(EXEEXT): 
$(validate_complex_single_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_complex_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_complex_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_hermitian_multiply_1stage_gpu_random_LINK) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_LINK) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_hermitian_multiply_1stage_random$(EXEEXT): $(validate_complex_single_hermitian_multiply_1stage_random_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_random_DEPENDENCIES) $(EXTRA_validate_complex_single_hermitian_multiply_1stage_random_DEPENDENCIES) + @rm -f validate_complex_single_hermitian_multiply_1stage_random$(EXEEXT) + 
$(AM_V_FCLD)$(validate_complex_single_hermitian_multiply_1stage_random_LINK) $(validate_complex_single_hermitian_multiply_1stage_random_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_complex_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT): $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_complex_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_complex_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_LINK) $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.$(OBJEXT): \ + test/Fortran/elpa2/$(am__dirstamp) \ + test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + +validate_double_instance@SUFFIX@$(EXEEXT): $(validate_double_instance@SUFFIX@_OBJECTS) $(validate_double_instance@SUFFIX@_DEPENDENCIES) $(EXTRA_validate_double_instance@SUFFIX@_DEPENDENCIES) + @rm -f validate_double_instance@SUFFIX@$(EXEEXT) + $(AM_V_FCLD)$(validate_double_instance@SUFFIX@_LINK) $(validate_double_instance@SUFFIX@_OBJECTS) $(validate_double_instance@SUFFIX@_LDADD) $(LIBS) +test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_multiple_objs_real_double$(EXEEXT): $(validate_multiple_objs_real_double_OBJECTS) $(validate_multiple_objs_real_double_DEPENDENCIES) $(EXTRA_validate_multiple_objs_real_double_DEPENDENCIES) + 
@rm -f validate_multiple_objs_real_double$(EXEEXT) + $(AM_V_FCLD)$(validate_multiple_objs_real_double_LINK) $(validate_multiple_objs_real_double_OBJECTS) $(validate_multiple_objs_real_double_LDADD) $(LIBS) +test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.$(OBJEXT): \ + test/C/$(am__dirstamp) test/C/$(DEPDIR)/$(am__dirstamp) + +validate_multiple_objs_real_double_c_version$(EXEEXT): $(validate_multiple_objs_real_double_c_version_OBJECTS) $(validate_multiple_objs_real_double_c_version_DEPENDENCIES) $(EXTRA_validate_multiple_objs_real_double_c_version_DEPENDENCIES) + @rm -f validate_multiple_objs_real_double_c_version$(EXEEXT) + $(AM_V_CCLD)$(validate_multiple_objs_real_double_c_version_LINK) $(validate_multiple_objs_real_double_c_version_OBJECTS) $(validate_multiple_objs_real_double_c_version_LDADD) $(LIBS) +test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.$(OBJEXT): \ + test/Fortran/elpa2/$(am__dirstamp) \ + test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + +validate_real_2stage_banded@SUFFIX@$(EXEEXT): $(validate_real_2stage_banded@SUFFIX@_OBJECTS) $(validate_real_2stage_banded@SUFFIX@_DEPENDENCIES) $(EXTRA_validate_real_2stage_banded@SUFFIX@_DEPENDENCIES) + @rm -f validate_real_2stage_banded@SUFFIX@$(EXEEXT) + $(AM_V_FCLD)$(validate_real_2stage_banded@SUFFIX@_LINK) $(validate_real_2stage_banded@SUFFIX@_OBJECTS) $(validate_real_2stage_banded@SUFFIX@_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_gpu_random$(EXEEXT): $(validate_real_double_cholesky_1stage_gpu_random_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_gpu_random_LINK) 
$(validate_real_double_cholesky_1stage_gpu_random_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_gpu_random_all_layouts_LINK) $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself$(EXEEXT): $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_gpu_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_LINK) $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_gpu_toeplitz$(EXEEXT): 
$(validate_real_double_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_gpu_toeplitz_LINK) $(validate_real_double_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_random$(EXEEXT): $(validate_real_double_cholesky_1stage_random_OBJECTS) $(validate_real_double_cholesky_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_random_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_random_LINK) $(validate_real_double_cholesky_1stage_random_OBJECTS) $(validate_real_double_cholesky_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.$(OBJEXT): \ + 
test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_random_all_layouts$(EXEEXT): $(validate_real_double_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_random_all_layouts_LINK) $(validate_real_double_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_random_split_comm_myself$(EXEEXT): $(validate_real_double_cholesky_1stage_random_split_comm_myself_OBJECTS) $(validate_real_double_cholesky_1stage_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_random_split_comm_myself_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_random_split_comm_myself_LINK) $(validate_real_double_cholesky_1stage_random_split_comm_myself_OBJECTS) $(validate_real_double_cholesky_1stage_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_toeplitz$(EXEEXT): $(validate_real_double_cholesky_1stage_toeplitz_OBJECTS) $(validate_real_double_cholesky_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_toeplitz_LINK) 
$(validate_real_double_cholesky_1stage_toeplitz_OBJECTS) $(validate_real_double_cholesky_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_cholesky_1stage_toeplitz_all_layouts_LINK) $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_frank$(EXEEXT): $(validate_real_double_eigenvalues_1stage_frank_OBJECTS) $(validate_real_double_eigenvalues_1stage_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_frank_LINK) $(validate_real_double_eigenvalues_1stage_frank_OBJECTS) $(validate_real_double_eigenvalues_1stage_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_frank_all_layouts_DEPENDENCIES) + @rm -f 
validate_real_double_eigenvalues_1stage_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_frank_all_layouts_LINK) $(validate_real_double_eigenvalues_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_gpu_frank$(EXEEXT): $(validate_real_double_eigenvalues_1stage_gpu_frank_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_gpu_frank_LINK) $(validate_real_double_eigenvalues_1stage_gpu_frank_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_LINK) $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT): 
$(validate_real_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_gpu_toeplitz_LINK) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_toeplitz$(EXEEXT): $(validate_real_double_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_toeplitz_LINK) $(validate_real_double_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_1stage_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_frank$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_frank_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_frank_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_OBJECTS) 
$(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f 
validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) 
$(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.$(OBJEXT): 
\ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_analytic$(EXEEXT): $(validate_real_double_eigenvectors_1stage_analytic_OBJECTS) $(validate_real_double_eigenvectors_1stage_analytic_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_analytic_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_analytic_LINK) $(validate_real_double_eigenvectors_1stage_analytic_OBJECTS) $(validate_real_double_eigenvectors_1stage_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_analytic_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_frank$(EXEEXT): $(validate_real_double_eigenvectors_1stage_frank_OBJECTS) $(validate_real_double_eigenvectors_1stage_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_frank_LINK) $(validate_real_double_eigenvectors_1stage_frank_OBJECTS) 
$(validate_real_double_eigenvectors_1stage_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_frank$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_frank_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_frank_LINK) $(validate_real_double_eigenvectors_1stage_gpu_frank_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_random_LINK) $(validate_real_double_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_LINK) $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_toeplitz_LINK) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_random$(EXEEXT): $(validate_real_double_eigenvectors_1stage_random_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_random_LINK) $(validate_real_double_eigenvectors_1stage_random_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_random_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_random_split_comm_myself$(EXEEXT): $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_random_split_comm_myself_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_random_split_comm_myself_LINK) $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_toeplitz_LINK) $(validate_real_double_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_analytic_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_frank$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_frank_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_OBJECTS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + 
+validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_random_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_LINK) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK) 
$(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_analytic_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_frank$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_frank_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank$(EXEEXT): 
$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) 
$(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_DEPENDENCIES) + @rm -f 
validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) 
$(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_random_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + 
+validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) 
$(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_scalapack_all_analytic$(EXEEXT): $(validate_real_double_eigenvectors_scalapack_all_analytic_OBJECTS) $(validate_real_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_scalapack_all_analytic_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_scalapack_all_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_scalapack_all_analytic_LINK) $(validate_real_double_eigenvectors_scalapack_all_analytic_OBJECTS) $(validate_real_double_eigenvectors_scalapack_all_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_LINK) $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_scalapack_part_analytic$(EXEEXT): $(validate_real_double_eigenvectors_scalapack_part_analytic_OBJECTS) $(validate_real_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_scalapack_part_analytic_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_scalapack_part_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_scalapack_part_analytic_LINK) $(validate_real_double_eigenvectors_scalapack_part_analytic_OBJECTS) $(validate_real_double_eigenvectors_scalapack_part_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT): $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_LINK) $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_OBJECTS) $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_1stage_gpu_random$(EXEEXT): $(validate_real_double_generalized_1stage_gpu_random_OBJECTS) $(validate_real_double_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_1stage_gpu_random_LINK) $(validate_real_double_generalized_1stage_gpu_random_OBJECTS) $(validate_real_double_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_generalized_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_1stage_gpu_random_all_layouts_LINK) $(validate_real_double_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_generalized_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_1stage_random$(EXEEXT): $(validate_real_double_generalized_1stage_random_OBJECTS) $(validate_real_double_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_real_double_generalized_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_1stage_random_LINK) $(validate_real_double_generalized_1stage_random_OBJECTS) 
$(validate_real_double_generalized_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_1stage_random_all_layouts$(EXEEXT): $(validate_real_double_generalized_1stage_random_all_layouts_OBJECTS) $(validate_real_double_generalized_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_generalized_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_1stage_random_all_layouts_LINK) $(validate_real_double_generalized_1stage_random_all_layouts_OBJECTS) $(validate_real_double_generalized_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_decomp_1stage_gpu_random_LINK) $(validate_real_double_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_real_double_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) 
$(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_LINK) $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_decomp_1stage_random$(EXEEXT): $(validate_real_double_generalized_decomp_1stage_random_OBJECTS) $(validate_real_double_generalized_decomp_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_real_double_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_decomp_1stage_random_LINK) $(validate_real_double_generalized_decomp_1stage_random_OBJECTS) $(validate_real_double_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT): $(validate_real_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_real_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_generalized_decomp_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_generalized_decomp_1stage_random_all_layouts_LINK) $(validate_real_double_generalized_decomp_1stage_random_all_layouts_OBJECTS) 
$(validate_real_double_generalized_decomp_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_frank$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_frank_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_frank_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_frank_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_frank_LINK) $(validate_real_double_hermitian_multiply_1stage_frank_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_frank_all_layouts$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_LINK) $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_gpu_frank$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_gpu_frank_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_DEPENDENCIES) 
$(EXTRA_validate_real_double_hermitian_multiply_1stage_gpu_frank_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_gpu_frank$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_gpu_frank_LINK) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_LINK) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_gpu_random$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_gpu_random_LINK) $(validate_real_double_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_LDADD) $(LIBS) 
+test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_LINK) $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_random$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_random_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_double_hermitian_multiply_1stage_random_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_random_LINK) $(validate_real_double_hermitian_multiply_1stage_random_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT): $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_double_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_hermitian_multiply_1stage_random_all_layouts_LINK) $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_LINK) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) 
$(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_solve_tridiagonal_1stage_toeplitz$(EXEEXT): $(validate_real_double_solve_tridiagonal_1stage_toeplitz_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_double_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_double_solve_tridiagonal_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_solve_tridiagonal_1stage_toeplitz_LINK) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_LINK) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_gpu_random$(EXEEXT): $(validate_real_single_cholesky_1stage_gpu_random_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_random_DEPENDENCIES) 
$(EXTRA_validate_real_single_cholesky_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_gpu_random_LINK) $(validate_real_single_cholesky_1stage_gpu_random_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_gpu_random_all_layouts_LINK) $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_single_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_gpu_toeplitz_LINK) $(validate_real_single_cholesky_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + 
+validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_random$(EXEEXT): $(validate_real_single_cholesky_1stage_random_OBJECTS) $(validate_real_single_cholesky_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_random_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_random_LINK) $(validate_real_single_cholesky_1stage_random_OBJECTS) $(validate_real_single_cholesky_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_random_all_layouts$(EXEEXT): $(validate_real_single_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_random_all_layouts_LINK) $(validate_real_single_cholesky_1stage_random_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_random_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_toeplitz$(EXEEXT): $(validate_real_single_cholesky_1stage_toeplitz_OBJECTS) $(validate_real_single_cholesky_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_toeplitz_LINK) $(validate_real_single_cholesky_1stage_toeplitz_OBJECTS) $(validate_real_single_cholesky_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_cholesky_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_cholesky_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_cholesky_1stage_toeplitz_all_layouts_LINK) $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_1stage_gpu_toeplitz_LINK) 
$(validate_real_single_eigenvalues_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_1stage_toeplitz$(EXEEXT): $(validate_real_single_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_1stage_toeplitz_LINK) $(validate_real_single_eigenvalues_1stage_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) 
$(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_LINK) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) 
$(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_analytic$(EXEEXT): $(validate_real_single_eigenvectors_1stage_analytic_OBJECTS) $(validate_real_single_eigenvectors_1stage_analytic_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_analytic_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_analytic_LINK) $(validate_real_single_eigenvectors_1stage_analytic_OBJECTS) $(validate_real_single_eigenvectors_1stage_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_analytic_all_layouts_LINK) $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_gpu_random$(EXEEXT): $(validate_real_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) 
$(EXTRA_validate_real_single_eigenvectors_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_gpu_random_LINK) $(validate_real_single_eigenvectors_1stage_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_LINK) $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_gpu_toeplitz_LINK) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + 
test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_random$(EXEEXT): $(validate_real_single_eigenvectors_1stage_random_OBJECTS) $(validate_real_single_eigenvectors_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_random_LINK) $(validate_real_single_eigenvectors_1stage_random_OBJECTS) $(validate_real_single_eigenvectors_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_random_all_layouts$(EXEEXT) + 
$(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_random_all_layouts_LINK) $(validate_real_single_eigenvectors_1stage_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_toeplitz_LINK) $(validate_real_single_eigenvectors_1stage_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT): 
$(validate_real_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_analytic_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_analytic_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_DEPENDENCIES) + @rm -f 
validate_real_single_eigenvectors_2stage_all_kernels_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_OBJECTS) 
$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_qr_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + 
test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_random_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_LINK) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LINK) 
$(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_analytic_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_analytic$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_analytic_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + 
+validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) 
$(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_qr_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_random_DEPENDENCIES) + @rm -f 
validate_real_single_eigenvectors_2stage_default_kernel_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_random_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_random_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_OBJECTS) 
$(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LINK) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_OBJECTS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_1stage_gpu_random$(EXEEXT): $(validate_real_single_generalized_1stage_gpu_random_OBJECTS) $(validate_real_single_generalized_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_generalized_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_1stage_gpu_random_LINK) $(validate_real_single_generalized_1stage_gpu_random_OBJECTS) $(validate_real_single_generalized_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT): 
$(validate_real_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_generalized_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_1stage_gpu_random_all_layouts_LINK) $(validate_real_single_generalized_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_generalized_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_1stage_random$(EXEEXT): $(validate_real_single_generalized_1stage_random_OBJECTS) $(validate_real_single_generalized_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_1stage_random_DEPENDENCIES) + @rm -f validate_real_single_generalized_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_1stage_random_LINK) $(validate_real_single_generalized_1stage_random_OBJECTS) $(validate_real_single_generalized_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_1stage_random_all_layouts$(EXEEXT): $(validate_real_single_generalized_1stage_random_all_layouts_OBJECTS) $(validate_real_single_generalized_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_generalized_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_1stage_random_all_layouts_LINK) $(validate_real_single_generalized_1stage_random_all_layouts_OBJECTS) $(validate_real_single_generalized_1stage_random_all_layouts_LDADD) $(LIBS) 
+test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT): $(validate_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_decomp_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_generalized_decomp_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_decomp_1stage_gpu_random_LINK) $(validate_real_single_generalized_decomp_1stage_gpu_random_OBJECTS) $(validate_real_single_generalized_decomp_1stage_gpu_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_LINK) $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_decomp_1stage_random$(EXEEXT): $(validate_real_single_generalized_decomp_1stage_random_OBJECTS) $(validate_real_single_generalized_decomp_1stage_random_DEPENDENCIES) 
$(EXTRA_validate_real_single_generalized_decomp_1stage_random_DEPENDENCIES) + @rm -f validate_real_single_generalized_decomp_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_decomp_1stage_random_LINK) $(validate_real_single_generalized_decomp_1stage_random_OBJECTS) $(validate_real_single_generalized_decomp_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT): $(validate_real_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_real_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_generalized_decomp_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_generalized_decomp_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_generalized_decomp_1stage_random_all_layouts_LINK) $(validate_real_single_generalized_decomp_1stage_random_all_layouts_OBJECTS) $(validate_real_single_generalized_decomp_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_hermitian_multiply_1stage_gpu_random$(EXEEXT): $(validate_real_single_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) $(EXTRA_validate_real_single_hermitian_multiply_1stage_gpu_random_DEPENDENCIES) + @rm -f validate_real_single_hermitian_multiply_1stage_gpu_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_hermitian_multiply_1stage_gpu_random_LINK) $(validate_real_single_hermitian_multiply_1stage_gpu_random_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_LDADD) $(LIBS) 
+test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT): $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_LINK) $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_hermitian_multiply_1stage_random$(EXEEXT): $(validate_real_single_hermitian_multiply_1stage_random_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_random_DEPENDENCIES) $(EXTRA_validate_real_single_hermitian_multiply_1stage_random_DEPENDENCIES) + @rm -f validate_real_single_hermitian_multiply_1stage_random$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_hermitian_multiply_1stage_random_LINK) $(validate_real_single_hermitian_multiply_1stage_random_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_random_LDADD) $(LIBS) +test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT): $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) 
$(EXTRA_validate_real_single_hermitian_multiply_1stage_random_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_hermitian_multiply_1stage_random_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_hermitian_multiply_1stage_random_all_layouts_LINK) $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_OBJECTS) $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT): $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_LINK) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LINK) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_OBJECTS) 
$(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_solve_tridiagonal_1stage_toeplitz$(EXEEXT): $(validate_real_single_solve_tridiagonal_1stage_toeplitz_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES) $(EXTRA_validate_real_single_solve_tridiagonal_1stage_toeplitz_DEPENDENCIES) + @rm -f validate_real_single_solve_tridiagonal_1stage_toeplitz$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_solve_tridiagonal_1stage_toeplitz_LINK) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_LDADD) $(LIBS) +test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT): $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES) $(EXTRA_validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_DEPENDENCIES) + @rm -f validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts$(EXEEXT) + $(AM_V_FCLD)$(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_LINK) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_OBJECTS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD) $(LIBS) +test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.$(OBJEXT): \ + test/Fortran/elpa2/$(am__dirstamp) \ + test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + +validate_single_complex_2stage_banded@SUFFIX@$(EXEEXT): $(validate_single_complex_2stage_banded@SUFFIX@_OBJECTS) 
$(validate_single_complex_2stage_banded@SUFFIX@_DEPENDENCIES) $(EXTRA_validate_single_complex_2stage_banded@SUFFIX@_DEPENDENCIES) + @rm -f validate_single_complex_2stage_banded@SUFFIX@$(EXEEXT) + $(AM_V_FCLD)$(validate_single_complex_2stage_banded@SUFFIX@_LINK) $(validate_single_complex_2stage_banded@SUFFIX@_OBJECTS) $(validate_single_complex_2stage_banded@SUFFIX@_LDADD) $(LIBS) +test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.$(OBJEXT): \ + test/Fortran/elpa2/$(am__dirstamp) \ + test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + +validate_single_real_2stage_banded@SUFFIX@$(EXEEXT): $(validate_single_real_2stage_banded@SUFFIX@_OBJECTS) $(validate_single_real_2stage_banded@SUFFIX@_DEPENDENCIES) $(EXTRA_validate_single_real_2stage_banded@SUFFIX@_DEPENDENCIES) + @rm -f validate_single_real_2stage_banded@SUFFIX@$(EXEEXT) + $(AM_V_FCLD)$(validate_single_real_2stage_banded@SUFFIX@_LINK) $(validate_single_real_2stage_banded@SUFFIX@_OBJECTS) $(validate_single_real_2stage_banded@SUFFIX@_LDADD) $(LIBS) +test/Fortran/validate_split_comm_real_double-test_split_comm.$(OBJEXT): \ + test/Fortran/$(am__dirstamp) \ + test/Fortran/$(DEPDIR)/$(am__dirstamp) + +validate_split_comm_real_double$(EXEEXT): $(validate_split_comm_real_double_OBJECTS) $(validate_split_comm_real_double_DEPENDENCIES) $(EXTRA_validate_split_comm_real_double_DEPENDENCIES) + @rm -f validate_split_comm_real_double$(EXEEXT) + $(AM_V_FCLD)$(validate_split_comm_real_double_LINK) $(validate_split_comm_real_double_OBJECTS) $(validate_split_comm_real_double_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) + -rm -f python/pyelpa/*.$(OBJEXT) + -rm -f python/pyelpa/*.lo -rm -f src/*.$(OBJEXT) -rm -f src/*.lo - -rm -f src/elpa2_kernels/*.$(OBJEXT) - -rm -f src/elpa2_kernels/*.lo - -rm -f src/elpa_qr/*.$(OBJEXT) - -rm -f src/elpa_qr/*.lo + -rm -f src/GPU/*.$(OBJEXT) + -rm -f src/GPU/*.lo + -rm -f src/elpa1/*.$(OBJEXT) + -rm -f src/elpa1/*.lo + -rm -f src/elpa2/*.$(OBJEXT) + -rm -f 
src/elpa2/*.lo + -rm -f src/elpa2/GPU/*.$(OBJEXT) + -rm -f src/elpa2/GPU/*.lo + -rm -f src/elpa2/kernels/*.$(OBJEXT) + -rm -f src/elpa2/kernels/*.lo + -rm -f src/elpa2/qr/*.$(OBJEXT) + -rm -f src/elpa2/qr/*.lo + -rm -f src/elpa_generalized/*.$(OBJEXT) + -rm -f src/elpa_generalized/*.lo -rm -f src/ftimings/*.$(OBJEXT) -rm -f src/ftimings/*.lo - -rm -f test/c_test_programs/*.$(OBJEXT) - -rm -f test/fortran_test_programs/*.$(OBJEXT) - -rm -f test/shared_sources/*.$(OBJEXT) + -rm -f src/general/*.$(OBJEXT) + -rm -f src/general/*.lo + -rm -f src/helpers/*.$(OBJEXT) + -rm -f src/helpers/*.lo + -rm -f test/C/*.$(OBJEXT) + -rm -f test/Fortran/*.$(OBJEXT) + -rm -f test/Fortran/elpa2/*.$(OBJEXT) + -rm -f test/shared/*.$(OBJEXT) + -rm -f test/shared/*.lo distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_avx-avx2_1hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_avx-avx2_2hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_sse_1hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_sse_2hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_2hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_4hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_6hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_2hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_4hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_6hv.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/highwater_mark.Plo@am__quote@ 
-@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/papi.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/resident_set_size.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/time.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/virtual_memory.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa1_test_complex_c_version.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa1_test_real_c_version.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa2_test_complex_c_version.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa2_test_real_c_version.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/shared_sources/$(DEPDIR)/call_elpa1.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@test/shared_sources/$(DEPDIR)/redir.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/elpa_c_interface.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/$(DEPDIR)/elpa_index.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_double_precision.Plo@am__quote@ # am--include-marker 
+@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sparc64_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sparc64_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sparc64_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_double_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_single_precision.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/elpa_generalized/$(DEPDIR)/cannon.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/highwater_mark.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/papi.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/resident_set_size.Plo@am__quote@ # am--include-marker 
+@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/time.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/virtual_memory.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/helpers/$(DEPDIR)/get_cpuid_set.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@src/helpers/$(DEPDIR)/print_build_config.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po@am__quote@ # am--include-marker 
+@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@test/shared/$(DEPDIR)/test_redir.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) .F90.o: $(AM_V_PPFC)$(PPFCCOMPILE) -c -o $@ $< @@ -1730,535 +14987,5355 @@ .F90.lo: $(AM_V_PPFC)$(LTPPFCCOMPILE) -c -o $@ $< -.c.o: -@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ -@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< +src/libelpa@SUFFIX@_private_la-elpa_impl.lo: src/elpa_impl.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_private_la-elpa_impl.lo `test -f 'src/elpa_impl.F90' || echo '$(srcdir)/'`src/elpa_impl.F90 -.c.obj: -@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ -@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po 
-@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +src/libelpa@SUFFIX@_private_la-elpa_autotune_impl.lo: src/elpa_autotune_impl.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_private_la-elpa_autotune_impl.lo `test -f 'src/elpa_autotune_impl.F90' || echo '$(srcdir)/'`src/elpa_autotune_impl.F90 -.c.lo: -@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ -@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ -@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< +src/libelpa@SUFFIX@_private_la-elpa_abstract_impl.lo: src/elpa_abstract_impl.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_private_la-elpa_abstract_impl.lo `test -f 'src/elpa_abstract_impl.F90' || echo '$(srcdir)/'`src/elpa_abstract_impl.F90 -.f90.o: - $(AM_V_FC)$(FCCOMPILE) -c -o $@ $< +src/helpers/libelpa@SUFFIX@_private_la-mod_precision.lo: src/helpers/mod_precision.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_precision.lo `test -f 'src/helpers/mod_precision.F90' || echo '$(srcdir)/'`src/helpers/mod_precision.F90 -.f90.obj: - $(AM_V_FC)$(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +src/helpers/libelpa@SUFFIX@_private_la-mod_blas_interfaces.lo: src/helpers/mod_blas_interfaces.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_blas_interfaces.lo `test -f 'src/helpers/mod_blas_interfaces.F90' || echo '$(srcdir)/'`src/helpers/mod_blas_interfaces.F90 -.f90.lo: - $(AM_V_FC)$(LTFCCOMPILE) -c -o $@ $< +src/helpers/libelpa@SUFFIX@_private_la-mod_scalapack_interfaces.lo: src/helpers/mod_scalapack_interfaces.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_scalapack_interfaces.lo `test -f 'src/helpers/mod_scalapack_interfaces.F90' || echo '$(srcdir)/'`src/helpers/mod_scalapack_interfaces.F90 -.s.o: - $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $< +src/helpers/libelpa@SUFFIX@_private_la-mod_mpi.lo: src/helpers/mod_mpi.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_mpi.lo `test -f 'src/helpers/mod_mpi.F90' || echo '$(srcdir)/'`src/helpers/mod_mpi.F90 -.s.obj: - $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +src/helpers/libelpa@SUFFIX@_private_la-mod_mpi_stubs.lo: src/helpers/mod_mpi_stubs.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC 
$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_mpi_stubs.lo `test -f 'src/helpers/mod_mpi_stubs.F90' || echo '$(srcdir)/'`src/helpers/mod_mpi_stubs.F90 -.s.lo: - $(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $< +src/helpers/libelpa@SUFFIX@_private_la-mod_omp.lo: src/helpers/mod_omp.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_omp.lo `test -f 'src/helpers/mod_omp.F90' || echo '$(srcdir)/'`src/helpers/mod_omp.F90 -mostlyclean-libtool: - -rm -f *.lo +src/libelpa@SUFFIX@_private_la-elpa_generated_fortran_interfaces.lo: src/elpa_generated_fortran_interfaces.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_private_la-elpa_generated_fortran_interfaces.lo `test -f 'src/elpa_generated_fortran_interfaces.F90' || echo '$(srcdir)/'`src/elpa_generated_fortran_interfaces.F90 -clean-libtool: - -rm -rf .libs _libs - -rm -rf src/.libs src/_libs - -rm -rf src/elpa2_kernels/.libs src/elpa2_kernels/_libs - -rm -rf src/elpa_qr/.libs src/elpa_qr/_libs - -rm -rf src/ftimings/.libs src/ftimings/_libs +src/elpa2/libelpa@SUFFIX@_private_la-mod_redist_band.lo: src/elpa2/mod_redist_band.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-mod_redist_band.lo `test -f 'src/elpa2/mod_redist_band.F90' || echo 
'$(srcdir)/'`src/elpa2/mod_redist_band.F90 -distclean-libtool: - -rm -f libtool config.lt -install-man1: $(dist_man_MANS) - @$(NORMAL_INSTALL) - @list1=''; \ - list2='$(dist_man_MANS)'; \ - test -n "$(man1dir)" \ - && test -n "`echo $$list1$$list2`" \ - || exit 0; \ - echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ - { for i in $$list1; do echo "$$i"; done; \ - if test -n "$$list2"; then \ - for i in $$list2; do echo "$$i"; done \ - | sed -n '/\.1[a-z]*$$/p'; \ - fi; \ - } | while read p; do \ - if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; echo "$$p"; \ - done | \ - sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ - sed 'N;N;s,\n, ,g' | { \ - list=; while read file base inst; do \ - if test "$$base" = "$$inst"; then list="$$list $$file"; else \ - echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ - $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ - fi; \ - done; \ - for i in $$list; do echo "$$i"; done | $(am__base_list) | \ - while read files; do \ - test -z "$$files" || { \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ - done; } +src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_cpu.lo: src/elpa2/mod_pack_unpack_cpu.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_cpu.lo `test -f 'src/elpa2/mod_pack_unpack_cpu.F90' || echo '$(srcdir)/'`src/elpa2/mod_pack_unpack_cpu.F90 -uninstall-man1: - @$(NORMAL_UNINSTALL) - @list=''; test -n "$(man1dir)" || exit 0; \ - files=`{ for i in $$list; do echo "$$i"; done; \ - l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ - sed -n 
'/\.1[a-z]*$$/p'; \ - } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ - dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) -install-man3: $(dist_man_MANS) - @$(NORMAL_INSTALL) - @list1=''; \ - list2='$(dist_man_MANS)'; \ - test -n "$(man3dir)" \ - && test -n "`echo $$list1$$list2`" \ - || exit 0; \ - echo " $(MKDIR_P) '$(DESTDIR)$(man3dir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(man3dir)" || exit 1; \ - { for i in $$list1; do echo "$$i"; done; \ - if test -n "$$list2"; then \ - for i in $$list2; do echo "$$i"; done \ - | sed -n '/\.3[a-z]*$$/p'; \ - fi; \ - } | while read p; do \ - if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; echo "$$p"; \ - done | \ - sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ - sed 'N;N;s,\n, ,g' | { \ - list=; while read file base inst; do \ - if test "$$base" = "$$inst"; then list="$$list $$file"; else \ - echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man3dir)/$$inst'"; \ - $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man3dir)/$$inst" || exit $$?; \ - fi; \ - done; \ - for i in $$list; do echo "$$i"; done | $(am__base_list) | \ - while read files; do \ - test -z "$$files" || { \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man3dir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(man3dir)" || exit $$?; }; \ - done; } +src/elpa2/libelpa@SUFFIX@_private_la-mod_compute_hh_trafo.lo: src/elpa2/mod_compute_hh_trafo.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-mod_compute_hh_trafo.lo `test -f 'src/elpa2/mod_compute_hh_trafo.F90' || echo '$(srcdir)/'`src/elpa2/mod_compute_hh_trafo.F90 -uninstall-man3: - @$(NORMAL_UNINSTALL) - @list=''; test -n "$(man3dir)" || exit 0; \ - files=`{ for i in $$list; do 
echo "$$i"; done; \ - l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ - sed -n '/\.3[a-z]*$$/p'; \ - } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ - dir='$(DESTDIR)$(man3dir)'; $(am__uninstall_files_from_dir) -install-dist_docDATA: $(dist_doc_DATA) - @$(NORMAL_INSTALL) - @list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(docdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(docdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(docdir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(docdir)" || exit $$?; \ - done +src/helpers/libelpa@SUFFIX@_private_la-aligned_mem.lo: src/helpers/aligned_mem.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-aligned_mem.lo `test -f 'src/helpers/aligned_mem.F90' || echo '$(srcdir)/'`src/helpers/aligned_mem.F90 -uninstall-dist_docDATA: - @$(NORMAL_UNINSTALL) - @list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ - dir='$(DESTDIR)$(docdir)'; $(am__uninstall_files_from_dir) -install-dist_filesDATA: $(dist_files_DATA) - @$(NORMAL_INSTALL) - @list='$(dist_files_DATA)'; test -n "$(filesdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(filesdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(filesdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(filesdir)'"; \ - 
$(INSTALL_DATA) $$files "$(DESTDIR)$(filesdir)" || exit $$?; \ - done +src/elpa1/libelpa@SUFFIX@_private_la-elpa1_compute_private.lo: src/elpa1/elpa1_compute_private.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa1/libelpa@SUFFIX@_private_la-elpa1_compute_private.lo `test -f 'src/elpa1/elpa1_compute_private.F90' || echo '$(srcdir)/'`src/elpa1/elpa1_compute_private.F90 -uninstall-dist_filesDATA: - @$(NORMAL_UNINSTALL) - @list='$(dist_files_DATA)'; test -n "$(filesdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ - dir='$(DESTDIR)$(filesdir)'; $(am__uninstall_files_from_dir) -install-pkgconfigDATA: $(pkgconfig_DATA) - @$(NORMAL_INSTALL) - @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(pkgconfigdir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(pkgconfigdir)" || exit 1; \ - fi; \ - for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; \ - done | $(am__base_list) | \ - while read files; do \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgconfigdir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgconfigdir)" || exit $$?; \ - done +src/elpa1/libelpa@SUFFIX@_private_la-elpa1_auxiliary.lo: src/elpa1/elpa1_auxiliary.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa1/libelpa@SUFFIX@_private_la-elpa1_auxiliary.lo `test -f 'src/elpa1/elpa1_auxiliary.F90' || echo '$(srcdir)/'`src/elpa1/elpa1_auxiliary.F90 -uninstall-pkgconfigDATA: - @$(NORMAL_UNINSTALL) - @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ - files=`for p in $$list; do echo $$p; done | sed 
-e 's|^.*/||'`; \ - dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir) -install-nobase_elpa_includeHEADERS: $(nobase_elpa_include_HEADERS) - @$(NORMAL_INSTALL) - @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ - if test -n "$$list"; then \ - echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)" || exit 1; \ - fi; \ - $(am__nobase_list) | while read dir files; do \ - xfiles=; for file in $$files; do \ - if test -f "$$file"; then xfiles="$$xfiles $$file"; \ - else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ - test -z "$$xfiles" || { \ - test "x$$dir" = x. || { \ - echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ - $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)/$$dir"; }; \ - echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ - $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(elpa_includedir)/$$dir" || exit $$?; }; \ - done +src/elpa2/libelpa@SUFFIX@_private_la-elpa2_determine_workload.lo: src/elpa2/elpa2_determine_workload.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-elpa2_determine_workload.lo `test -f 'src/elpa2/elpa2_determine_workload.F90' || echo '$(srcdir)/'`src/elpa2/elpa2_determine_workload.F90 -uninstall-nobase_elpa_includeHEADERS: - @$(NORMAL_UNINSTALL) - @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ - $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ - dir='$(DESTDIR)$(elpa_includedir)'; $(am__uninstall_files_from_dir) +src/elpa2/libelpa@SUFFIX@_private_la-elpa2_compute.lo: src/elpa2/elpa2_compute.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-elpa2_compute.lo `test -f 'src/elpa2/elpa2_compute.F90' || echo '$(srcdir)/'`src/elpa2/elpa2_compute.F90 -ID: $(am__tagged_files) - $(am__define_uniq_tagged_files); mkid -fID $$unique -tags: tags-am -TAGS: tags +src/elpa2/kernels/libelpa@SUFFIX@_private_la-mod_single_hh_trafo_real.lo: src/elpa2/kernels/mod_single_hh_trafo_real.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-mod_single_hh_trafo_real.lo `test -f 'src/elpa2/kernels/mod_single_hh_trafo_real.F90' || echo '$(srcdir)/'`src/elpa2/kernels/mod_single_hh_trafo_real.F90 -tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - set x; \ - here=`pwd`; \ - $(am__define_uniq_tagged_files); \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: ctags-am +src/GPU/libelpa@SUFFIX@_private_la-check_for_gpu.lo: src/GPU/check_for_gpu.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/GPU/libelpa@SUFFIX@_private_la-check_for_gpu.lo `test -f 'src/GPU/check_for_gpu.F90' || echo '$(srcdir)/'`src/GPU/check_for_gpu.F90 -CTAGS: ctags -ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - $(am__define_uniq_tagged_files); \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique +src/GPU/libelpa@SUFFIX@_private_la-mod_cuda.lo: 
src/GPU/mod_cuda.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/GPU/libelpa@SUFFIX@_private_la-mod_cuda.lo `test -f 'src/GPU/mod_cuda.F90' || echo '$(srcdir)/'`src/GPU/mod_cuda.F90 -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" -cscope: cscope.files - test ! -s cscope.files \ - || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS) -clean-cscope: - -rm -f cscope.files -cscope.files: clean-cscope cscopelist -cscopelist: cscopelist-am +src/elpa2/GPU/libelpa@SUFFIX@_private_la-interface_c_kernel.lo: src/elpa2/GPU/interface_c_kernel.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/GPU/libelpa@SUFFIX@_private_la-interface_c_kernel.lo `test -f 'src/elpa2/GPU/interface_c_kernel.F90' || echo '$(srcdir)/'`src/elpa2/GPU/interface_c_kernel.F90 -cscopelist-am: $(am__tagged_files) - list='$(am__tagged_files)'; \ - case "$(srcdir)" in \ - [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ - *) sdir=$(subdir)/$(srcdir) ;; \ - esac; \ - for i in $$list; do \ - if test -f "$$i"; then \ - echo "$(subdir)/$$i"; \ - else \ - echo "$$sdir/$$i"; \ - fi; \ - done >> $(top_builddir)/cscope.files +src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_gpu.lo: src/elpa2/mod_pack_unpack_gpu.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-mod_pack_unpack_gpu.lo `test -f 'src/elpa2/mod_pack_unpack_gpu.F90' || echo 
'$(srcdir)/'`src/elpa2/mod_pack_unpack_gpu.F90 -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -rm -f cscope.out cscope.in.out cscope.po.out cscope.files +src/elpa2/qr/libelpa@SUFFIX@_private_la-qr_utils.lo: src/elpa2/qr/qr_utils.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/qr/libelpa@SUFFIX@_private_la-qr_utils.lo `test -f 'src/elpa2/qr/qr_utils.F90' || echo '$(srcdir)/'`src/elpa2/qr/qr_utils.F90 -# Recover from deleted '.trs' file; this should ensure that -# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create -# both 'foo.log' and 'foo.trs'. Break the recipe in two subshells -# to avoid problems with "make -n". -.log.trs: - rm -f $< $@ - $(MAKE) $(AM_MAKEFLAGS) $< +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_qrkernels.lo: src/elpa2/qr/elpa_qrkernels.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_qrkernels.lo `test -f 'src/elpa2/qr/elpa_qrkernels.F90' || echo '$(srcdir)/'`src/elpa2/qr/elpa_qrkernels.F90 -# Leading 'am--fnord' is there to ensure the list of targets does not -# expand to empty, as could happen e.g. with make check TESTS=''. 
-am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck) -am--force-recheck: - @: +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdlarfb.lo: src/elpa2/qr/elpa_pdlarfb.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdlarfb.lo `test -f 'src/elpa2/qr/elpa_pdlarfb.F90' || echo '$(srcdir)/'`src/elpa2/qr/elpa_pdlarfb.F90 -$(TEST_SUITE_LOG): $(TEST_LOGS) - @$(am__set_TESTS_bases); \ - am__f_ok () { test -f "$$1" && test -r "$$1"; }; \ - redo_bases=`for i in $$bases; do \ - am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \ - done`; \ - if test -n "$$redo_bases"; then \ - redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \ - redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \ - if $(am__make_dryrun); then :; else \ - rm -f $$redo_logs && rm -f $$redo_results || exit 1; \ - fi; \ - fi; \ - if test -n "$$am__remaking_logs"; then \ - echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \ - "recursion detected" >&2; \ - elif test -n "$$redo_logs"; then \ - am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \ - fi; \ - if $(am__make_dryrun); then :; else \ - st=0; \ - errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \ - for i in $$redo_bases; do \ - test -f $$i.trs && test -r $$i.trs \ - || { echo "$$errmsg $$i.trs" >&2; st=1; }; \ - test -f $$i.log && test -r $$i.log \ - || { echo "$$errmsg $$i.log" >&2; st=1; }; \ - done; \ - test $$st -eq 0 || exit 1; \ - fi - @$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \ - ws='[ ]'; \ - results=`for b in $$bases; do echo $$b.trs; done`; \ - test -n "$$results" || results=/dev/null; \ - all=` grep "^$$ws*:test-result:" $$results | wc -l`; \ - pass=` grep "^$$ws*:test-result:$$ws*PASS" $$results | wc -l`; \ - fail=` grep 
"^$$ws*:test-result:$$ws*FAIL" $$results | wc -l`; \ - skip=` grep "^$$ws*:test-result:$$ws*SKIP" $$results | wc -l`; \ - xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \ - xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \ - error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \ - if test `expr $$fail + $$xpass + $$error` -eq 0; then \ - success=true; \ - else \ - success=false; \ - fi; \ - br='==================='; br=$$br$$br$$br$$br; \ - result_count () \ - { \ - if test x"$$1" = x"--maybe-color"; then \ - maybe_colorize=yes; \ - elif test x"$$1" = x"--no-color"; then \ - maybe_colorize=no; \ - else \ - echo "$@: invalid 'result_count' usage" >&2; exit 4; \ - fi; \ - shift; \ - desc=$$1 count=$$2; \ - if test $$maybe_colorize = yes && test $$count -gt 0; then \ - color_start=$$3 color_end=$$std; \ - else \ - color_start= color_end=; \ - fi; \ - echo "$${color_start}# $$desc $$count$${color_end}"; \ - }; \ - create_testsuite_report () \ - { \ - result_count $$1 "TOTAL:" $$all "$$brg"; \ - result_count $$1 "PASS: " $$pass "$$grn"; \ - result_count $$1 "SKIP: " $$skip "$$blu"; \ - result_count $$1 "XFAIL:" $$xfail "$$lgn"; \ - result_count $$1 "FAIL: " $$fail "$$red"; \ - result_count $$1 "XPASS:" $$xpass "$$red"; \ - result_count $$1 "ERROR:" $$error "$$mgn"; \ - }; \ - { \ - echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" | \ - $(am__rst_title); \ - create_testsuite_report --no-color; \ - echo; \ - echo ".. 
contents:: :depth: 2"; \ - echo; \ - for b in $$bases; do echo $$b; done \ - | $(am__create_global_log); \ - } >$(TEST_SUITE_LOG).tmp || exit 1; \ - mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG); \ - if $$success; then \ - col="$$grn"; \ - else \ - col="$$red"; \ - test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG); \ - fi; \ - echo "$${col}$$br$${std}"; \ - echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}"; \ - echo "$${col}$$br$${std}"; \ - create_testsuite_report --maybe-color; \ - echo "$$col$$br$$std"; \ - if $$success; then :; else \ - echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}"; \ - if test -n "$(PACKAGE_BUGREPORT)"; then \ - echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}"; \ - fi; \ - echo "$$col$$br$$std"; \ - fi; \ - $$success || exit 1 +src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdgeqrf.lo: src/elpa2/qr/elpa_pdgeqrf.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/qr/libelpa@SUFFIX@_private_la-elpa_pdgeqrf.lo `test -f 'src/elpa2/qr/elpa_pdgeqrf.F90' || echo '$(srcdir)/'`src/elpa2/qr/elpa_pdgeqrf.F90 -check-TESTS: - @list='$(RECHECK_LOGS)'; test -z "$$list" || rm -f $$list - @list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list - @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) - @set +e; $(am__set_TESTS_bases); \ - log_list=`for i in $$bases; do echo $$i.log; done`; \ - trs_list=`for i in $$bases; do echo $$i.trs; done`; \ - log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \ - $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \ - exit $$?; -recheck: all $(check_SCRIPTS) - @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) - @set +e; $(am__set_TESTS_bases); \ - bases=`for i in $$bases; do echo $$i; done \ - | $(am__list_recheck_tests)` || exit 1; \ - log_list=`for i in $$bases; do echo $$i.log; 
done`; \ - log_list=`echo $$log_list`; \ - $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \ - am__force_recheck=am--force-recheck \ - TEST_LOGS="$$log_list"; \ - exit $$? -elpa1_test_real@SUFFIX@.sh.log: elpa1_test_real@SUFFIX@.sh - @p='elpa1_test_real@SUFFIX@.sh'; \ - b='elpa1_test_real@SUFFIX@.sh'; \ +src/elpa1/libelpa@SUFFIX@_private_la-elpa1.lo: src/elpa1/elpa1.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa1/libelpa@SUFFIX@_private_la-elpa1.lo `test -f 'src/elpa1/elpa1.F90' || echo '$(srcdir)/'`src/elpa1/elpa1.F90 + +src/elpa2/libelpa@SUFFIX@_private_la-elpa2.lo: src/elpa2/elpa2.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/libelpa@SUFFIX@_private_la-elpa2.lo `test -f 'src/elpa2/elpa2.F90' || echo '$(srcdir)/'`src/elpa2/elpa2.F90 + +src/helpers/libelpa@SUFFIX@_private_la-matrix_plot.lo: src/helpers/matrix_plot.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-matrix_plot.lo `test -f 'src/helpers/matrix_plot.F90' || echo '$(srcdir)/'`src/helpers/matrix_plot.F90 + +src/general/libelpa@SUFFIX@_private_la-mod_elpa_skewsymmetric_blas.lo: src/general/mod_elpa_skewsymmetric_blas.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/general/libelpa@SUFFIX@_private_la-mod_elpa_skewsymmetric_blas.lo `test -f 
'src/general/mod_elpa_skewsymmetric_blas.F90' || echo '$(srcdir)/'`src/general/mod_elpa_skewsymmetric_blas.F90 + +src/general/libelpa@SUFFIX@_private_la-elpa_utilities.lo: src/general/elpa_utilities.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/general/libelpa@SUFFIX@_private_la-elpa_utilities.lo `test -f 'src/general/elpa_utilities.F90' || echo '$(srcdir)/'`src/general/elpa_utilities.F90 + +src/ftimings/libelpa@SUFFIX@_private_la-ftimings.lo: src/ftimings/ftimings.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/ftimings/libelpa@SUFFIX@_private_la-ftimings.lo `test -f 'src/ftimings/ftimings.F90' || echo '$(srcdir)/'`src/ftimings/ftimings.F90 + +src/ftimings/libelpa@SUFFIX@_private_la-ftimings_type.lo: src/ftimings/ftimings_type.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/ftimings/libelpa@SUFFIX@_private_la-ftimings_type.lo `test -f 'src/ftimings/ftimings_type.F90' || echo '$(srcdir)/'`src/ftimings/ftimings_type.F90 + +src/ftimings/libelpa@SUFFIX@_private_la-ftimings_value.lo: src/ftimings/ftimings_value.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/ftimings/libelpa@SUFFIX@_private_la-ftimings_value.lo `test -f 'src/ftimings/ftimings_value.F90' || echo '$(srcdir)/'`src/ftimings/ftimings_value.F90 + 
+src/helpers/libelpa@SUFFIX@_private_la-timer_dummy.lo: src/helpers/timer_dummy.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-timer_dummy.lo `test -f 'src/helpers/timer_dummy.F90' || echo '$(srcdir)/'`src/helpers/timer_dummy.F90 + +src/helpers/libelpa@SUFFIX@_private_la-mod_time_c.lo: src/helpers/mod_time_c.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_time_c.lo `test -f 'src/helpers/mod_time_c.F90' || echo '$(srcdir)/'`src/helpers/mod_time_c.F90 + +src/helpers/libelpa@SUFFIX@_private_la-mod_simd_kernel.lo: src/helpers/mod_simd_kernel.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/helpers/libelpa@SUFFIX@_private_la-mod_simd_kernel.lo `test -f 'src/helpers/mod_simd_kernel.F90' || echo '$(srcdir)/'`src/helpers/mod_simd_kernel.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real.lo: src/elpa2/kernels/real.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real.lo `test -f 'src/elpa2/kernels/real.F90' || echo '$(srcdir)/'`src/elpa2/kernels/real.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex.lo: src/elpa2/kernels/complex.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile 
$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex.lo `test -f 'src/elpa2/kernels/complex.F90' || echo '$(srcdir)/'`src/elpa2/kernels/complex.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple.lo: src/elpa2/kernels/real_simple.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple.lo `test -f 'src/elpa2/kernels/real_simple.F90' || echo '$(srcdir)/'`src/elpa2/kernels/real_simple.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex_simple.lo: src/elpa2/kernels/complex_simple.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-complex_simple.lo `test -f 'src/elpa2/kernels/complex_simple.F90' || echo '$(srcdir)/'`src/elpa2/kernels/complex_simple.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block4.lo: src/elpa2/kernels/real_simple_block4.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block4.lo `test -f 'src/elpa2/kernels/real_simple_block4.F90' || echo '$(srcdir)/'`src/elpa2/kernels/real_simple_block4.F90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block6.lo: src/elpa2/kernels/real_simple_block6.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_simple_block6.lo `test -f 'src/elpa2/kernels/real_simple_block6.F90' || echo '$(srcdir)/'`src/elpa2/kernels/real_simple_block6.F90 + +src/libelpa@SUFFIX@_public_la-elpa.lo: src/elpa.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_public_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_public_la-elpa.lo `test -f 'src/elpa.F90' || echo '$(srcdir)/'`src/elpa.F90 + +src/libelpa@SUFFIX@_public_la-elpa_api.lo: src/elpa_api.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_public_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_public_la-elpa_api.lo `test -f 'src/elpa_api.F90' || echo '$(srcdir)/'`src/elpa_api.F90 + +src/libelpa@SUFFIX@_public_la-elpa_constants.lo: src/elpa_constants.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpa@SUFFIX@_public_la_FCFLAGS) $(FCFLAGS) -c -o src/libelpa@SUFFIX@_public_la-elpa_constants.lo `test -f 'src/elpa_constants.F90' || echo '$(srcdir)/'`src/elpa_constants.F90 + +test/shared/libelpatest@SUFFIX@_la-tests_variable_definitions.lo: test/shared/tests_variable_definitions.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-tests_variable_definitions.lo `test -f 'test/shared/tests_variable_definitions.F90' || echo '$(srcdir)/'`test/shared/tests_variable_definitions.F90 + 
+test/shared/libelpatest@SUFFIX@_la-mod_tests_scalapack_interfaces.lo: test/shared/mod_tests_scalapack_interfaces.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-mod_tests_scalapack_interfaces.lo `test -f 'test/shared/mod_tests_scalapack_interfaces.F90' || echo '$(srcdir)/'`test/shared/mod_tests_scalapack_interfaces.F90 + +test/shared/libelpatest@SUFFIX@_la-mod_tests_blas_interfaces.lo: test/shared/mod_tests_blas_interfaces.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-mod_tests_blas_interfaces.lo `test -f 'test/shared/mod_tests_blas_interfaces.F90' || echo '$(srcdir)/'`test/shared/mod_tests_blas_interfaces.F90 + +test/shared/libelpatest@SUFFIX@_la-test_util.lo: test/shared/test_util.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_util.lo `test -f 'test/shared/test_util.F90' || echo '$(srcdir)/'`test/shared/test_util.F90 + +test/shared/libelpatest@SUFFIX@_la-test_read_input_parameters.lo: test/shared/test_read_input_parameters.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_read_input_parameters.lo `test -f 'test/shared/test_read_input_parameters.F90' || echo '$(srcdir)/'`test/shared/test_read_input_parameters.F90 + 
+test/shared/libelpatest@SUFFIX@_la-test_check_correctness.lo: test/shared/test_check_correctness.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_check_correctness.lo `test -f 'test/shared/test_check_correctness.F90' || echo '$(srcdir)/'`test/shared/test_check_correctness.F90 + +test/shared/libelpatest@SUFFIX@_la-test_setup_mpi.lo: test/shared/test_setup_mpi.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_setup_mpi.lo `test -f 'test/shared/test_setup_mpi.F90' || echo '$(srcdir)/'`test/shared/test_setup_mpi.F90 + +test/shared/libelpatest@SUFFIX@_la-test_blacs_infrastructure.lo: test/shared/test_blacs_infrastructure.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_blacs_infrastructure.lo `test -f 'test/shared/test_blacs_infrastructure.F90' || echo '$(srcdir)/'`test/shared/test_blacs_infrastructure.F90 + +test/shared/libelpatest@SUFFIX@_la-test_prepare_matrix.lo: test/shared/test_prepare_matrix.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_prepare_matrix.lo `test -f 'test/shared/test_prepare_matrix.F90' || echo '$(srcdir)/'`test/shared/test_prepare_matrix.F90 + +test/shared/libelpatest@SUFFIX@_la-test_analytic.lo: 
test/shared/test_analytic.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_analytic.lo `test -f 'test/shared/test_analytic.F90' || echo '$(srcdir)/'`test/shared/test_analytic.F90 + +test/shared/libelpatest@SUFFIX@_la-test_output_type.lo: test/shared/test_output_type.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_output_type.lo `test -f 'test/shared/test_output_type.F90' || echo '$(srcdir)/'`test/shared/test_output_type.F90 + +test/shared/libelpatest@SUFFIX@_la-test_scalapack.lo: test/shared/test_scalapack.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_scalapack.lo `test -f 'test/shared/test_scalapack.F90' || echo '$(srcdir)/'`test/shared/test_scalapack.F90 + +test/shared/libelpatest@SUFFIX@_la-test_redirect.lo: test/shared/test_redirect.F90 + $(AM_V_PPFC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libelpatest@SUFFIX@_la_FCFLAGS) $(FCFLAGS) -c -o test/shared/libelpatest@SUFFIX@_la-test_redirect.lo `test -f 'test/shared/test_redirect.F90' || echo '$(srcdir)/'`test/shared/test_redirect.F90 + +src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.o: src/elpa2/elpa2_print_kernels.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(elpa2_print_kernels@SUFFIX@_FCFLAGS) $(FCFLAGS) -c 
-o src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.o `test -f 'src/elpa2/elpa2_print_kernels.F90' || echo '$(srcdir)/'`src/elpa2/elpa2_print_kernels.F90 + +src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.obj: src/elpa2/elpa2_print_kernels.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(elpa2_print_kernels@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/elpa2_print_kernels@SUFFIX@-elpa2_print_kernels.obj `if test -f 'src/elpa2/elpa2_print_kernels.F90'; then $(CYGPATH_W) 'src/elpa2/elpa2_print_kernels.F90'; else $(CYGPATH_W) '$(srcdir)/src/elpa2/elpa2_print_kernels.F90'; fi` + +test/Fortran/skewsymmetric_real_double-test_skewsymmetric.o: test/Fortran/test_skewsymmetric.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_skewsymmetric_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/skewsymmetric_real_double-test_skewsymmetric.o `test -f 'test/Fortran/test_skewsymmetric.F90' || echo '$(srcdir)/'`test/Fortran/test_skewsymmetric.F90 + +test/Fortran/skewsymmetric_real_double-test_skewsymmetric.obj: test/Fortran/test_skewsymmetric.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_skewsymmetric_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/skewsymmetric_real_double-test_skewsymmetric.obj `if test -f 'test/Fortran/test_skewsymmetric.F90'; then $(CYGPATH_W) 'test/Fortran/test_skewsymmetric.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_skewsymmetric.F90'; fi` + +test/Fortran/skewsymmetric_real_single-test_skewsymmetric.o: test/Fortran/test_skewsymmetric.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_skewsymmetric_real_single_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/skewsymmetric_real_single-test_skewsymmetric.o `test -f 'test/Fortran/test_skewsymmetric.F90' || echo '$(srcdir)/'`test/Fortran/test_skewsymmetric.F90 + 
+test/Fortran/skewsymmetric_real_single-test_skewsymmetric.obj: test/Fortran/test_skewsymmetric.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_skewsymmetric_real_single_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/skewsymmetric_real_single-test_skewsymmetric.obj `if test -f 'test/Fortran/test_skewsymmetric.F90'; then $(CYGPATH_W) 'test/Fortran/test_skewsymmetric.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_skewsymmetric.F90'; fi` + +test/Fortran/validate_autotune_complex_double-test_autotune.o: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_complex_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_complex_double-test_autotune.o `test -f 'test/Fortran/test_autotune.F90' || echo '$(srcdir)/'`test/Fortran/test_autotune.F90 + +test/Fortran/validate_autotune_complex_double-test_autotune.obj: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_complex_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_complex_double-test_autotune.obj `if test -f 'test/Fortran/test_autotune.F90'; then $(CYGPATH_W) 'test/Fortran/test_autotune.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_autotune.F90'; fi` + +test/Fortran/validate_autotune_complex_single-test_autotune.o: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_complex_single_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_complex_single-test_autotune.o `test -f 'test/Fortran/test_autotune.F90' || echo '$(srcdir)/'`test/Fortran/test_autotune.F90 + +test/Fortran/validate_autotune_complex_single-test_autotune.obj: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_complex_single_FCFLAGS) 
$(FCFLAGS) -c -o test/Fortran/validate_autotune_complex_single-test_autotune.obj `if test -f 'test/Fortran/test_autotune.F90'; then $(CYGPATH_W) 'test/Fortran/test_autotune.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_autotune.F90'; fi` + +test/Fortran/validate_autotune_real_double-test_autotune.o: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_real_double-test_autotune.o `test -f 'test/Fortran/test_autotune.F90' || echo '$(srcdir)/'`test/Fortran/test_autotune.F90 + +test/Fortran/validate_autotune_real_double-test_autotune.obj: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_real_double-test_autotune.obj `if test -f 'test/Fortran/test_autotune.F90'; then $(CYGPATH_W) 'test/Fortran/test_autotune.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_autotune.F90'; fi` + +test/Fortran/validate_autotune_real_single-test_autotune.o: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_real_single_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_real_single-test_autotune.o `test -f 'test/Fortran/test_autotune.F90' || echo '$(srcdir)/'`test/Fortran/test_autotune.F90 + +test/Fortran/validate_autotune_real_single-test_autotune.obj: test/Fortran/test_autotune.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_real_single_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_autotune_real_single-test_autotune.obj `if test -f 'test/Fortran/test_autotune.F90'; then $(CYGPATH_W) 'test/Fortran/test_autotune.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_autotune.F90'; fi` + 
+test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.o: test/Fortran/elpa2/complex_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.o `test -f 'test/Fortran/elpa2/complex_2stage_banded.F90' || echo '$(srcdir)/'`test/Fortran/elpa2/complex_2stage_banded.F90 + +test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.obj: test/Fortran/elpa2/complex_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_complex_2stage_banded@SUFFIX@-complex_2stage_banded.obj `if test -f 'test/Fortran/elpa2/complex_2stage_banded.F90'; then $(CYGPATH_W) 'test/Fortran/elpa2/complex_2stage_banded.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/elpa2/complex_2stage_banded.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_cholesky_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + 
$(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 
'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) 
'$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 
'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) 
$(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) 
$(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || 
echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.o `test 
-f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_generalized_decomp_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_double_hermitian_multiply_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_single_cholesky_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then 
$(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.o: test/Fortran/test.F90 + 
$(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_cholesky_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' 
|| echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.o `test 
-f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.o `test -f 
'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 
+ $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_single_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 
'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_complex_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_random_FCFLAGS) 
$(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo 
'$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_generalized_decomp_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_complex_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_complex_single_hermitian_multiply_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.o: test/Fortran/elpa2/double_instance.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_double_instance@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.o `test -f 'test/Fortran/elpa2/double_instance.F90' || echo '$(srcdir)/'`test/Fortran/elpa2/double_instance.F90 + 
+test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.obj: test/Fortran/elpa2/double_instance.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_double_instance@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_double_instance@SUFFIX@-double_instance.obj `if test -f 'test/Fortran/elpa2/double_instance.F90'; then $(CYGPATH_W) 'test/Fortran/elpa2/double_instance.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/elpa2/double_instance.F90'; fi` + +test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.o: test/Fortran/test_multiple_objs.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.o `test -f 'test/Fortran/test_multiple_objs.F90' || echo '$(srcdir)/'`test/Fortran/test_multiple_objs.F90 + +test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.obj: test/Fortran/test_multiple_objs.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_multiple_objs_real_double-test_multiple_objs.obj `if test -f 'test/Fortran/test_multiple_objs.F90'; then $(CYGPATH_W) 'test/Fortran/test_multiple_objs.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_multiple_objs.F90'; fi` + +test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.o: test/Fortran/elpa2/real_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.o `test -f 'test/Fortran/elpa2/real_2stage_banded.F90' || echo '$(srcdir)/'`test/Fortran/elpa2/real_2stage_banded.F90 + 
+test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.obj: test/Fortran/elpa2/real_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_real_2stage_banded@SUFFIX@-real_2stage_banded.obj `if test -f 'test/Fortran/elpa2/real_2stage_banded.F90'; then $(CYGPATH_W) 'test/Fortran/elpa2/real_2stage_banded.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/elpa2/real_2stage_banded.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_FCFLAGS) 
$(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) 
'$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.o: test/Fortran/test.F90 + 
$(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_cholesky_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank-test.obj `if test -f 'test/Fortran/test.F90'; 
then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then 
$(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 
'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_analytic-test.obj 
`if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo 
'$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.o `test -f 
'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) 
$(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_all_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_all_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_part_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_part_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.obj: 
test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_generalized_decomp_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_hermitian_multiply_1stage_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_hermitian_multiply_1stage_random_all_layouts-test.obj `if test -f 
'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 
'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_random-test.o `test -f 'test/Fortran/test.F90' || 
echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_cholesky_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_cholesky_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz-test.obj `if test -f 
'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) 
'$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) 
$(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c 
-o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo 
'$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo 
'$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || 
echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + 
+test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_real_single_generalized_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c 
-o test/Fortran/validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_generalized_decomp_1stage_random_all_layouts-test.obj `if test -f 
'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else 
$(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_random_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_random-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_hermitian_multiply_1stage_random_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + 
+test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.o: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.o `test -f 'test/Fortran/test.F90' || echo '$(srcdir)/'`test/Fortran/test.F90 + +test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.obj: test/Fortran/test.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts-test.obj `if test -f 'test/Fortran/test.F90'; then $(CYGPATH_W) 'test/Fortran/test.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test.F90'; fi` + +test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.o: 
test/Fortran/elpa2/single_complex_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_single_complex_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.o `test -f 'test/Fortran/elpa2/single_complex_2stage_banded.F90' || echo '$(srcdir)/'`test/Fortran/elpa2/single_complex_2stage_banded.F90 + +test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.obj: test/Fortran/elpa2/single_complex_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_single_complex_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_single_complex_2stage_banded@SUFFIX@-single_complex_2stage_banded.obj `if test -f 'test/Fortran/elpa2/single_complex_2stage_banded.F90'; then $(CYGPATH_W) 'test/Fortran/elpa2/single_complex_2stage_banded.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/elpa2/single_complex_2stage_banded.F90'; fi` + +test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.o: test/Fortran/elpa2/single_real_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_single_real_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.o `test -f 'test/Fortran/elpa2/single_real_2stage_banded.F90' || echo '$(srcdir)/'`test/Fortran/elpa2/single_real_2stage_banded.F90 + +test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.obj: test/Fortran/elpa2/single_real_2stage_banded.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_single_real_2stage_banded@SUFFIX@_FCFLAGS) $(FCFLAGS) -c -o 
test/Fortran/elpa2/validate_single_real_2stage_banded@SUFFIX@-single_real_2stage_banded.obj `if test -f 'test/Fortran/elpa2/single_real_2stage_banded.F90'; then $(CYGPATH_W) 'test/Fortran/elpa2/single_real_2stage_banded.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/elpa2/single_real_2stage_banded.F90'; fi` + +test/Fortran/validate_split_comm_real_double-test_split_comm.o: test/Fortran/test_split_comm.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_split_comm_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_split_comm_real_double-test_split_comm.o `test -f 'test/Fortran/test_split_comm.F90' || echo '$(srcdir)/'`test/Fortran/test_split_comm.F90 + +test/Fortran/validate_split_comm_real_double-test_split_comm.obj: test/Fortran/test_split_comm.F90 + $(AM_V_PPFC)$(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_split_comm_real_double_FCFLAGS) $(FCFLAGS) -c -o test/Fortran/validate_split_comm_real_double-test_split_comm.obj `if test -f 'test/Fortran/test_split_comm.F90'; then $(CYGPATH_W) 'test/Fortran/test_split_comm.F90'; else $(CYGPATH_W) '$(srcdir)/test/Fortran/test_split_comm.F90'; fi` + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< + +.c.obj: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< + +python/pyelpa/wrapper_la-wrapper.lo: python/pyelpa/wrapper.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(wrapper_la_CFLAGS) $(CFLAGS) -MT python/pyelpa/wrapper_la-wrapper.lo -MD -MP -MF python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Tpo -c -o python/pyelpa/wrapper_la-wrapper.lo `test -f 'python/pyelpa/wrapper.c' || echo '$(srcdir)/'`python/pyelpa/wrapper.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Tpo python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='python/pyelpa/wrapper.c' object='python/pyelpa/wrapper_la-wrapper.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(wrapper_la_CFLAGS) $(CFLAGS) -c -o python/pyelpa/wrapper_la-wrapper.lo `test -f 'python/pyelpa/wrapper.c' || echo 
'$(srcdir)/'`python/pyelpa/wrapper.c + +test/C/validate_autotune_c_version_complex_double-test_autotune.o: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_double_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_complex_double-test_autotune.o -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_complex_double-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_complex_double-test_autotune.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_double_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_complex_double-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c + +test/C/validate_autotune_c_version_complex_double-test_autotune.obj: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_double_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_complex_double-test_autotune.obj -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_complex_double-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else 
$(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_complex_double-test_autotune.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_double_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_complex_double-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` + +test/C/validate_autotune_c_version_complex_single-test_autotune.o: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_single_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_complex_single-test_autotune.o -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_complex_single-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_complex_single-test_autotune.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ 
$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_single_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_complex_single-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c + +test/C/validate_autotune_c_version_complex_single-test_autotune.obj: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_single_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_complex_single-test_autotune.obj -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_complex_single-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_complex_single-test_autotune.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_complex_single_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_complex_single-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` + +test/C/validate_autotune_c_version_real_double-test_autotune.o: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_autotune_c_version_real_double_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_real_double-test_autotune.o -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_real_double-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_real_double-test_autotune.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_double_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_real_double-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c + +test/C/validate_autotune_c_version_real_double-test_autotune.obj: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_double_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_real_double-test_autotune.obj -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_real_double-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_real_double-test_autotune.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_double_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_real_double-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` + +test/C/validate_autotune_c_version_real_single-test_autotune.o: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_single_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_real_single-test_autotune.o -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_real_single-test_autotune.o `test -f 'test/C/test_autotune.c' || echo '$(srcdir)/'`test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_real_single-test_autotune.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_single_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_real_single-test_autotune.o `test -f 'test/C/test_autotune.c' || echo 
'$(srcdir)/'`test/C/test_autotune.c + +test/C/validate_autotune_c_version_real_single-test_autotune.obj: test/C/test_autotune.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_single_CFLAGS) $(CFLAGS) -MT test/C/validate_autotune_c_version_real_single-test_autotune.obj -MD -MP -MF test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Tpo -c -o test/C/validate_autotune_c_version_real_single-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Tpo test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_autotune.c' object='test/C/validate_autotune_c_version_real_single-test_autotune.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_autotune_c_version_real_single_CFLAGS) $(CFLAGS) -c -o test/C/validate_autotune_c_version_real_single-test_autotune.obj `if test -f 'test/C/test_autotune.c'; then $(CYGPATH_W) 'test/C/test_autotune.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_autotune.c'; fi` + +test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Tpo -c -o 
test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' 
object='test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.o `test -f 
'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT 
test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; 
else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ 
$(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_generalized_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ 
$(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_generalized_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) 
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ 
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Tpo -c 
-o test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' 
object='test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o 
test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) 
-MT test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Tpo 
test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ 
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if 
test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT 
test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c 
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) 
@AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_generalized_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_generalized_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) 
$(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo 
'$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' 
object='test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o 
test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_complex_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) 
test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if 
test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_generalized_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) 
test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_generalized_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_real_double_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) 
$(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo 
'$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) 
$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_double_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ 
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo 
'$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ 
$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + 
+test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS) 
$(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Tpo -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ 
$(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) 
@AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_generalized_1stage_random-test.o: test/C/test.c 
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_generalized_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) 
test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ 
$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + 
+test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.o: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.o -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Tpo -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.o `test -f 'test/C/test.c' || echo '$(srcdir)/'`test/C/test.c + +test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.obj: test/C/test.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -MT test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.obj -MD -MP -MF test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Tpo -c -o 
test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Tpo test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test.c' object='test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS) $(CFLAGS) -c -o test/C/validate_c_version_real_single_generalized_decomp_1stage_random-test.obj `if test -f 'test/C/test.c'; then $(CYGPATH_W) 'test/C/test.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test.c'; fi` + +test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.o: test/C/test_multiple_objs.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_c_version_CFLAGS) $(CFLAGS) -MT test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.o -MD -MP -MF test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Tpo -c -o test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.o `test -f 'test/C/test_multiple_objs.c' || echo '$(srcdir)/'`test/C/test_multiple_objs.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Tpo test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ 
$(AM_V_CC)source='test/C/test_multiple_objs.c' object='test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_c_version_CFLAGS) $(CFLAGS) -c -o test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.o `test -f 'test/C/test_multiple_objs.c' || echo '$(srcdir)/'`test/C/test_multiple_objs.c + +test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.obj: test/C/test_multiple_objs.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_c_version_CFLAGS) $(CFLAGS) -MT test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.obj -MD -MP -MF test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Tpo -c -o test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.obj `if test -f 'test/C/test_multiple_objs.c'; then $(CYGPATH_W) 'test/C/test_multiple_objs.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_multiple_objs.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Tpo test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='test/C/test_multiple_objs.c' object='test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(validate_multiple_objs_real_double_c_version_CFLAGS) $(CFLAGS) -c -o 
test/C/validate_multiple_objs_real_double_c_version-test_multiple_objs.obj `if test -f 'test/C/test_multiple_objs.c'; then $(CYGPATH_W) 'test/C/test_multiple_objs.c'; else $(CYGPATH_W) '$(srcdir)/test/C/test_multiple_objs.c'; fi` + +.f90.o: + $(AM_V_FC)$(FCCOMPILE) -c -o $@ $< + +.f90.obj: + $(AM_V_FC)$(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.f90.lo: + $(AM_V_FC)$(LTFCCOMPILE) -c -o $@ $< + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgp.lo: src/elpa2/kernels/real_bgp.f90 + $(AM_V_FC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgp.lo `test -f 'src/elpa2/kernels/real_bgp.f90' || echo '$(srcdir)/'`src/elpa2/kernels/real_bgp.f90 + +src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgq.lo: src/elpa2/kernels/real_bgq.f90 + $(AM_V_FC)$(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(FC) $(libelpa@SUFFIX@_private_la_FCFLAGS) $(FCFLAGS) -c -o src/elpa2/kernels/libelpa@SUFFIX@_private_la-real_bgq.lo `test -f 'src/elpa2/kernels/real_bgq.f90' || echo '$(srcdir)/'`src/elpa2/kernels/real_bgq.f90 + +.s.o: + $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $< + +.s.obj: + $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.s.lo: + $(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + -rm -rf python/pyelpa/.libs python/pyelpa/_libs + -rm -rf src/.libs src/_libs + -rm -rf src/GPU/.libs src/GPU/_libs + -rm -rf src/elpa1/.libs src/elpa1/_libs + -rm -rf src/elpa2/.libs src/elpa2/_libs + -rm -rf src/elpa2/GPU/.libs src/elpa2/GPU/_libs + -rm -rf src/elpa2/kernels/.libs src/elpa2/kernels/_libs + -rm -rf src/elpa2/qr/.libs src/elpa2/qr/_libs + -rm -rf src/elpa_generalized/.libs src/elpa_generalized/_libs + -rm -rf src/ftimings/.libs src/ftimings/_libs + -rm -rf src/general/.libs src/general/_libs + -rm -rf src/helpers/.libs 
src/helpers/_libs + -rm -rf test/shared/.libs test/shared/_libs + +distclean-libtool: + -rm -f libtool config.lt +install-pyelpaPYTHON: $(pyelpa_PYTHON) + @$(NORMAL_INSTALL) + @list='$(pyelpa_PYTHON)'; dlist=; list2=; test -n "$(pyelpadir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(pyelpadir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(pyelpadir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then b=; else b="$(srcdir)/"; fi; \ + if test -f $$b$$p; then \ + $(am__strip_dir) \ + dlist="$$dlist $$f"; \ + list2="$$list2 $$b$$p"; \ + else :; fi; \ + done; \ + for file in $$list2; do echo $$file; done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pyelpadir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(pyelpadir)" || exit $$?; \ + done || exit $$?; \ + if test -n "$$dlist"; then \ + $(am__py_compile) --destdir "$(DESTDIR)" \ + --basedir "$(pyelpadir)" $$dlist; \ + else :; fi + +uninstall-pyelpaPYTHON: + @$(NORMAL_UNINSTALL) + @list='$(pyelpa_PYTHON)'; test -n "$(pyelpadir)" || list=; \ + py_files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + test -n "$$py_files" || exit 0; \ + dir='$(DESTDIR)$(pyelpadir)'; \ + pyc_files=`echo "$$py_files" | sed 's|$$|c|'`; \ + pyo_files=`echo "$$py_files" | sed 's|$$|o|'`; \ + st=0; \ + for files in "$$py_files" "$$pyc_files" "$$pyo_files"; do \ + $(am__uninstall_files_from_dir) || st=$$?; \ + done; \ + dir='$(DESTDIR)$(pyelpadir)/__pycache__'; \ + echo "$$py_files" | $(am__pep3147_tweak) | $(am__base_list) | \ + while read files; do \ + $(am__uninstall_files_from_dir) || st=$$?; \ + done || exit $$?; \ + exit $$st +install-man1: $(dist_man_MANS) + @$(NORMAL_INSTALL) + @list1=''; \ + list2='$(dist_man_MANS)'; \ + test -n "$(man1dir)" \ + && test -n "`echo $$list1$$list2`" \ + || exit 0; \ + echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ + { for i in $$list1; do echo "$$i"; done; \ + if test 
-n "$$list2"; then \ + for i in $$list2; do echo "$$i"; done \ + | sed -n '/\.1[a-z]*$$/p'; \ + fi; \ + } | while read p; do \ + if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; echo "$$p"; \ + done | \ + sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ + sed 'N;N;s,\n, ,g' | { \ + list=; while read file base inst; do \ + if test "$$base" = "$$inst"; then list="$$list $$file"; else \ + echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ + $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ + fi; \ + done; \ + for i in $$list; do echo "$$i"; done | $(am__base_list) | \ + while read files; do \ + test -z "$$files" || { \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ + done; } + +uninstall-man1: + @$(NORMAL_UNINSTALL) + @list=''; test -n "$(man1dir)" || exit 0; \ + files=`{ for i in $$list; do echo "$$i"; done; \ + l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ + sed -n '/\.1[a-z]*$$/p'; \ + } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ + dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) +install-man3: $(dist_man_MANS) + @$(NORMAL_INSTALL) + @list1=''; \ + list2='$(dist_man_MANS)'; \ + test -n "$(man3dir)" \ + && test -n "`echo $$list1$$list2`" \ + || exit 0; \ + echo " $(MKDIR_P) '$(DESTDIR)$(man3dir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(man3dir)" || exit 1; \ + { for i in $$list1; do echo "$$i"; done; \ + if test -n "$$list2"; then \ + for i in $$list2; do echo "$$i"; done \ + | sed -n '/\.3[a-z]*$$/p'; \ + fi; \ + } | while read p; do \ + if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; echo "$$p"; \ + done | \ + sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ + sed 'N;N;s,\n, ,g' | { \ + list=; while read file 
base inst; do \ + if test "$$base" = "$$inst"; then list="$$list $$file"; else \ + echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man3dir)/$$inst'"; \ + $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man3dir)/$$inst" || exit $$?; \ + fi; \ + done; \ + for i in $$list; do echo "$$i"; done | $(am__base_list) | \ + while read files; do \ + test -z "$$files" || { \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man3dir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(man3dir)" || exit $$?; }; \ + done; } + +uninstall-man3: + @$(NORMAL_UNINSTALL) + @list=''; test -n "$(man3dir)" || exit 0; \ + files=`{ for i in $$list; do echo "$$i"; done; \ + l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ + sed -n '/\.3[a-z]*$$/p'; \ + } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ + -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ + dir='$(DESTDIR)$(man3dir)'; $(am__uninstall_files_from_dir) +install-dist_docDATA: $(dist_doc_DATA) + @$(NORMAL_INSTALL) + @list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(docdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(docdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(docdir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(docdir)" || exit $$?; \ + done + +uninstall-dist_docDATA: + @$(NORMAL_UNINSTALL) + @list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(docdir)'; $(am__uninstall_files_from_dir) +install-pkgconfigDATA: $(pkgconfig_DATA) + @$(NORMAL_INSTALL) + @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(pkgconfigdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(pkgconfigdir)" || exit 1; \ + fi; \ + for p in $$list; do \ + if test -f "$$p"; then d=; 
else d="$(srcdir)/"; fi; \ + echo "$$d$$p"; \ + done | $(am__base_list) | \ + while read files; do \ + echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgconfigdir)'"; \ + $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgconfigdir)" || exit $$?; \ + done + +uninstall-pkgconfigDATA: + @$(NORMAL_UNINSTALL) + @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ + files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ + dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir) +install-nobase_elpa_includeHEADERS: $(nobase_elpa_include_HEADERS) + @$(NORMAL_INSTALL) + @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)" || exit 1; \ + fi; \ + $(am__nobase_list) | while read dir files; do \ + xfiles=; for file in $$files; do \ + if test -f "$$file"; then xfiles="$$xfiles $$file"; \ + else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ + test -z "$$xfiles" || { \ + test "x$$dir" = x. 
|| { \ + echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ + $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)/$$dir"; }; \ + echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ + $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(elpa_includedir)/$$dir" || exit $$?; }; \ + done + +uninstall-nobase_elpa_includeHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ + $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ + dir='$(DESTDIR)$(elpa_includedir)'; $(am__uninstall_files_from_dir) +install-nobase_nodist_elpa_includeHEADERS: $(nobase_nodist_elpa_include_HEADERS) + @$(NORMAL_INSTALL) + @list='$(nobase_nodist_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)" || exit 1; \ + fi; \ + $(am__nobase_list) | while read dir files; do \ + xfiles=; for file in $$files; do \ + if test -f "$$file"; then xfiles="$$xfiles $$file"; \ + else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ + test -z "$$xfiles" || { \ + test "x$$dir" = x. 
|| { \ + echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ + $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)/$$dir"; }; \ + echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ + $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(elpa_includedir)/$$dir" || exit $$?; }; \ + done + +uninstall-nobase_nodist_elpa_includeHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(nobase_nodist_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ + $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ + dir='$(DESTDIR)$(elpa_includedir)'; $(am__uninstall_files_from_dir) + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscope: cscope.files + test ! 
-s cscope.files \ + || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS) +clean-cscope: + -rm -f cscope.files +cscope.files: clean-cscope cscopelist +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + -rm -f cscope.out cscope.in.out cscope.po.out cscope.files + +# Recover from deleted '.trs' file; this should ensure that +# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create +# both 'foo.log' and 'foo.trs'. Break the recipe in two subshells +# to avoid problems with "make -n". +.log.trs: + rm -f $< $@ + $(MAKE) $(AM_MAKEFLAGS) $< + +# Leading 'am--fnord' is there to ensure the list of targets does not +# expand to empty, as could happen e.g. with make check TESTS=''. 
+am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck) +am--force-recheck: + @: + +$(TEST_SUITE_LOG): $(TEST_LOGS) + @$(am__set_TESTS_bases); \ + am__f_ok () { test -f "$$1" && test -r "$$1"; }; \ + redo_bases=`for i in $$bases; do \ + am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \ + done`; \ + if test -n "$$redo_bases"; then \ + redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \ + redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \ + if $(am__make_dryrun); then :; else \ + rm -f $$redo_logs && rm -f $$redo_results || exit 1; \ + fi; \ + fi; \ + if test -n "$$am__remaking_logs"; then \ + echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \ + "recursion detected" >&2; \ + elif test -n "$$redo_logs"; then \ + am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \ + fi; \ + if $(am__make_dryrun); then :; else \ + st=0; \ + errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \ + for i in $$redo_bases; do \ + test -f $$i.trs && test -r $$i.trs \ + || { echo "$$errmsg $$i.trs" >&2; st=1; }; \ + test -f $$i.log && test -r $$i.log \ + || { echo "$$errmsg $$i.log" >&2; st=1; }; \ + done; \ + test $$st -eq 0 || exit 1; \ + fi + @$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \ + ws='[ ]'; \ + results=`for b in $$bases; do echo $$b.trs; done`; \ + test -n "$$results" || results=/dev/null; \ + all=` grep "^$$ws*:test-result:" $$results | wc -l`; \ + pass=` grep "^$$ws*:test-result:$$ws*PASS" $$results | wc -l`; \ + fail=` grep "^$$ws*:test-result:$$ws*FAIL" $$results | wc -l`; \ + skip=` grep "^$$ws*:test-result:$$ws*SKIP" $$results | wc -l`; \ + xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \ + xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \ + error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \ + if test `expr $$fail + $$xpass + $$error` -eq 0; then \ + success=true; \ + else \ + success=false; \ + fi; \ + br='==================='; br=$$br$$br$$br$$br; 
\ + result_count () \ + { \ + if test x"$$1" = x"--maybe-color"; then \ + maybe_colorize=yes; \ + elif test x"$$1" = x"--no-color"; then \ + maybe_colorize=no; \ + else \ + echo "$@: invalid 'result_count' usage" >&2; exit 4; \ + fi; \ + shift; \ + desc=$$1 count=$$2; \ + if test $$maybe_colorize = yes && test $$count -gt 0; then \ + color_start=$$3 color_end=$$std; \ + else \ + color_start= color_end=; \ + fi; \ + echo "$${color_start}# $$desc $$count$${color_end}"; \ + }; \ + create_testsuite_report () \ + { \ + result_count $$1 "TOTAL:" $$all "$$brg"; \ + result_count $$1 "PASS: " $$pass "$$grn"; \ + result_count $$1 "SKIP: " $$skip "$$blu"; \ + result_count $$1 "XFAIL:" $$xfail "$$lgn"; \ + result_count $$1 "FAIL: " $$fail "$$red"; \ + result_count $$1 "XPASS:" $$xpass "$$red"; \ + result_count $$1 "ERROR:" $$error "$$mgn"; \ + }; \ + { \ + echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" | \ + $(am__rst_title); \ + create_testsuite_report --no-color; \ + echo; \ + echo ".. contents:: :depth: 2"; \ + echo; \ + for b in $$bases; do echo $$b; done \ + | $(am__create_global_log); \ + } >$(TEST_SUITE_LOG).tmp || exit 1; \ + mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG); \ + if $$success; then \ + col="$$grn"; \ + else \ + col="$$red"; \ + test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG); \ + fi; \ + echo "$${col}$$br$${std}"; \ + echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}"; \ + echo "$${col}$$br$${std}"; \ + create_testsuite_report --maybe-color; \ + echo "$$col$$br$$std"; \ + if $$success; then :; else \ + echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}"; \ + if test -n "$(PACKAGE_BUGREPORT)"; then \ + echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}"; \ + fi; \ + echo "$$col$$br$$std"; \ + fi; \ + $$success || exit 1 + +check-TESTS: $(check_SCRIPTS) + @list='$(RECHECK_LOGS)'; test -z "$$list" || rm -f $$list + @list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list + @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) 
+ @set +e; $(am__set_TESTS_bases); \ + log_list=`for i in $$bases; do echo $$i.log; done`; \ + trs_list=`for i in $$bases; do echo $$i.trs; done`; \ + log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \ + $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \ + exit $$?; +recheck: all $(check_SCRIPTS) + @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) + @set +e; $(am__set_TESTS_bases); \ + bases=`for i in $$bases; do echo $$i; done \ + | $(am__list_recheck_tests)` || exit 1; \ + log_list=`for i in $$bases; do echo $$i.log; done`; \ + log_list=`echo $$log_list`; \ + $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \ + am__force_recheck=am--force-recheck \ + TEST_LOGS="$$log_list"; \ + exit $$? +validate_c_version_complex_double_eigenvectors_1stage_random_default.sh.log: validate_c_version_complex_double_eigenvectors_1stage_random_default.sh + @p='validate_c_version_complex_double_eigenvectors_1stage_random_default.sh'; \ + b='validate_c_version_complex_double_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_eigenvectors_1stage_random_default.sh.log: validate_c_version_real_double_eigenvectors_1stage_random_default.sh + 
@p='validate_c_version_real_double_eigenvectors_1stage_random_default.sh'; \ + b='validate_c_version_real_double_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_eigenvectors_1stage_random_default.sh.log: validate_c_version_complex_single_eigenvectors_1stage_random_default.sh + @p='validate_c_version_complex_single_eigenvectors_1stage_random_default.sh'; \ + b='validate_c_version_complex_single_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_eigenvectors_1stage_random_default.sh.log: validate_c_version_real_single_eigenvectors_1stage_random_default.sh + @p='validate_c_version_real_single_eigenvectors_1stage_random_default.sh'; \ + b='validate_c_version_real_single_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_generalized_1stage_random_default.sh.log: validate_c_version_complex_double_generalized_1stage_random_default.sh + @p='validate_c_version_complex_double_generalized_1stage_random_default.sh'; \ + b='validate_c_version_complex_double_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_generalized_1stage_random_default.sh.log: validate_c_version_real_double_generalized_1stage_random_default.sh + @p='validate_c_version_real_double_generalized_1stage_random_default.sh'; \ + 
b='validate_c_version_real_double_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_generalized_1stage_random_default.sh.log: validate_c_version_complex_single_generalized_1stage_random_default.sh + @p='validate_c_version_complex_single_generalized_1stage_random_default.sh'; \ + b='validate_c_version_complex_single_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_generalized_1stage_random_default.sh.log: validate_c_version_real_single_generalized_1stage_random_default.sh + @p='validate_c_version_real_single_generalized_1stage_random_default.sh'; \ + b='validate_c_version_real_single_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh.log: validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh + @p='validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh'; \ + b='validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_generalized_decomp_1stage_random_default.sh.log: 
validate_c_version_real_double_generalized_decomp_1stage_random_default.sh + @p='validate_c_version_real_double_generalized_decomp_1stage_random_default.sh'; \ + b='validate_c_version_real_double_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh.log: validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh + @p='validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh'; \ + b='validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_generalized_decomp_1stage_random_default.sh.log: validate_c_version_real_single_generalized_decomp_1stage_random_default.sh + @p='validate_c_version_real_single_generalized_decomp_1stage_random_default.sh'; \ + b='validate_c_version_real_single_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh.log: validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh + @p='validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + 
$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh.log: validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh + @p='validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh.log: 
validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh + @p='validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh.log: validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh + @p='validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + 
$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh.log: validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh + @p='validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_generalized_1stage_gpu_random_default.sh.log: validate_c_version_real_double_generalized_1stage_gpu_random_default.sh + @p='validate_c_version_real_double_generalized_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_double_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh.log: validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh + @p='validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_generalized_1stage_gpu_random_default.sh.log: validate_c_version_real_single_generalized_1stage_gpu_random_default.sh + 
@p='validate_c_version_real_single_generalized_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_single_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh.log: validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh.log: validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh.log: validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh.log: validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_analytic_default.sh.log: validate_complex_double_eigenvectors_1stage_analytic_default.sh + @p='validate_complex_double_eigenvectors_1stage_analytic_default.sh'; \ + b='validate_complex_double_eigenvectors_1stage_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + 
b='validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- 
$(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh.log: validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh + @p='validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh'; \ + b='validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh.log: validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh + @p='validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh'; \ + 
b='validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_analytic_default.sh.log: validate_real_double_eigenvectors_1stage_analytic_default.sh + @p='validate_real_double_eigenvectors_1stage_analytic_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh.log: validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh'; \ + 
b='validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_scalapack_all_analytic_default.sh.log: validate_real_double_eigenvectors_scalapack_all_analytic_default.sh + @p='validate_real_double_eigenvectors_scalapack_all_analytic_default.sh'; \ + b='validate_real_double_eigenvectors_scalapack_all_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh.log: validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_scalapack_part_analytic_default.sh.log: validate_real_double_eigenvectors_scalapack_part_analytic_default.sh + @p='validate_real_double_eigenvectors_scalapack_part_analytic_default.sh'; \ + b='validate_real_double_eigenvectors_scalapack_part_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_analytic_default.sh.log: validate_complex_single_eigenvectors_1stage_analytic_default.sh + @p='validate_complex_single_eigenvectors_1stage_analytic_default.sh'; \ + b='validate_complex_single_eigenvectors_1stage_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + 
b='validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh.log: validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_eigenvectors_1stage_analytic_default.sh.log: validate_real_single_eigenvectors_1stage_analytic_default.sh + @p='validate_real_single_eigenvectors_1stage_analytic_default.sh'; \ + b='validate_real_single_eigenvectors_1stage_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + 
b='validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh.log: validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_frank_default.sh.log: validate_real_double_eigenvalues_1stage_frank_default.sh + @p='validate_real_double_eigenvalues_1stage_frank_default.sh'; \ + b='validate_real_double_eigenvalues_1stage_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh.log: 
validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_frank_default.sh.log: validate_real_double_eigenvectors_1stage_frank_default.sh + @p='validate_real_double_eigenvectors_1stage_frank_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- 
$(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh'; \ + 
b='validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh.log: validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh + @p='validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_frank_default.sh.log: validate_real_double_hermitian_multiply_1stage_frank_default.sh + @p='validate_real_double_hermitian_multiply_1stage_frank_default.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh.log: validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_gpu_frank_default.sh.log: 
validate_real_double_eigenvalues_1stage_gpu_frank_default.sh + @p='validate_real_double_eigenvalues_1stage_gpu_frank_default.sh'; \ + b='validate_real_double_eigenvalues_1stage_gpu_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + 
$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_frank_default.sh.log: validate_real_double_eigenvectors_1stage_gpu_frank_default.sh + @p='validate_real_double_eigenvectors_1stage_gpu_frank_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh + 
@p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh.log: validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh + @p='validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh.log: validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh + @p='validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + 
"$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh.log: validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh + @p='validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_random_default.sh.log: validate_complex_double_cholesky_1stage_random_default.sh + @p='validate_complex_double_cholesky_1stage_random_default.sh'; \ + b='validate_complex_double_cholesky_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_random_all_layouts_extended.sh.log: validate_real_double_cholesky_1stage_random_all_layouts_extended.sh + @p='validate_real_double_cholesky_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_double_cholesky_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_random_default.sh.log: validate_real_double_cholesky_1stage_random_default.sh + @p='validate_real_double_cholesky_1stage_random_default.sh'; \ + b='validate_real_double_cholesky_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" 
$(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh.log: validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh + @p='validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh'; \ + b='validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh.log: validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh + @p='validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_random_default.sh.log: validate_complex_single_cholesky_1stage_random_default.sh + @p='validate_complex_single_cholesky_1stage_random_default.sh'; \ + b='validate_complex_single_cholesky_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_random_all_layouts_extended.sh.log: validate_real_single_cholesky_1stage_random_all_layouts_extended.sh + @p='validate_real_single_cholesky_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_single_cholesky_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_random_default.sh.log: validate_real_single_cholesky_1stage_random_default.sh + @p='validate_real_single_cholesky_1stage_random_default.sh'; \ + b='validate_real_single_cholesky_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_random_default.sh.log: validate_complex_double_eigenvectors_1stage_random_default.sh + @p='validate_complex_double_eigenvectors_1stage_random_default.sh'; \ + b='validate_complex_double_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + $(am__check_pre) 
$(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh.log: 
validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_random_default.sh.log: validate_real_double_eigenvectors_1stage_random_default.sh + @p='validate_real_double_eigenvectors_1stage_random_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh.log: validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh + @p='validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) 
$(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh + 
@p='validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_random_default.sh.log: validate_complex_single_eigenvectors_1stage_random_default.sh + @p='validate_complex_single_eigenvectors_1stage_random_default.sh'; \ + b='validate_complex_single_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) 
$(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh + 
@p='validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_random_default.sh.log: validate_real_single_eigenvectors_1stage_random_default.sh + @p='validate_real_single_eigenvectors_1stage_random_default.sh'; \ + b='validate_real_single_eigenvectors_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- 
$(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_1stage_random_all_layouts_default.sh.log: validate_complex_double_generalized_1stage_random_all_layouts_default.sh + @p='validate_complex_double_generalized_1stage_random_all_layouts_default.sh'; \ + b='validate_complex_double_generalized_1stage_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_1stage_random_default.sh.log: validate_complex_double_generalized_1stage_random_default.sh + @p='validate_complex_double_generalized_1stage_random_default.sh'; \ + b='validate_complex_double_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file 
$$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_1stage_random_all_layouts_default.sh.log: validate_real_double_generalized_1stage_random_all_layouts_default.sh + @p='validate_real_double_generalized_1stage_random_all_layouts_default.sh'; \ + b='validate_real_double_generalized_1stage_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_1stage_random_default.sh.log: validate_real_double_generalized_1stage_random_default.sh + @p='validate_real_double_generalized_1stage_random_default.sh'; \ + b='validate_real_double_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_1stage_random_all_layouts_default.sh.log: validate_complex_single_generalized_1stage_random_all_layouts_default.sh + @p='validate_complex_single_generalized_1stage_random_all_layouts_default.sh'; \ + b='validate_complex_single_generalized_1stage_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_1stage_random_default.sh.log: validate_complex_single_generalized_1stage_random_default.sh + @p='validate_complex_single_generalized_1stage_random_default.sh'; \ + b='validate_complex_single_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) 
--test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_1stage_random_all_layouts_default.sh.log: validate_real_single_generalized_1stage_random_all_layouts_default.sh + @p='validate_real_single_generalized_1stage_random_all_layouts_default.sh'; \ + b='validate_real_single_generalized_1stage_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_1stage_random_default.sh.log: validate_real_single_generalized_1stage_random_default.sh + @p='validate_real_single_generalized_1stage_random_default.sh'; \ + b='validate_real_single_generalized_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh.log: validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh + @p='validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_decomp_1stage_random_default.sh.log: validate_complex_double_generalized_decomp_1stage_random_default.sh + @p='validate_complex_double_generalized_decomp_1stage_random_default.sh'; \ + 
b='validate_complex_double_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh.log: validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh + @p='validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_decomp_1stage_random_default.sh.log: validate_real_double_generalized_decomp_1stage_random_default.sh + @p='validate_real_double_generalized_decomp_1stage_random_default.sh'; \ + b='validate_real_double_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh.log: validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh + @p='validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_complex_single_generalized_decomp_1stage_random_default.sh.log: validate_complex_single_generalized_decomp_1stage_random_default.sh + @p='validate_complex_single_generalized_decomp_1stage_random_default.sh'; \ + b='validate_complex_single_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh.log: validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh + @p='validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_decomp_1stage_random_default.sh.log: validate_real_single_generalized_decomp_1stage_random_default.sh + @p='validate_real_single_generalized_decomp_1stage_random_default.sh'; \ + b='validate_real_single_generalized_decomp_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh.log: validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh + @p='validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + b='validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file 
$$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_hermitian_multiply_1stage_random_default.sh.log: validate_complex_double_hermitian_multiply_1stage_random_default.sh + @p='validate_complex_double_hermitian_multiply_1stage_random_default.sh'; \ + b='validate_complex_double_hermitian_multiply_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh.log: validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh + @p='validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_random_default.sh.log: validate_real_double_hermitian_multiply_1stage_random_default.sh + @p='validate_real_double_hermitian_multiply_1stage_random_default.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh.log: validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh + @p='validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + 
b='validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_hermitian_multiply_1stage_random_default.sh.log: validate_complex_single_hermitian_multiply_1stage_random_default.sh + @p='validate_complex_single_hermitian_multiply_1stage_random_default.sh'; \ + b='validate_complex_single_hermitian_multiply_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh.log: validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh + @p='validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + b='validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_hermitian_multiply_1stage_random_default.sh.log: validate_real_single_hermitian_multiply_1stage_random_default.sh + @p='validate_real_single_hermitian_multiply_1stage_random_default.sh'; \ + b='validate_real_single_hermitian_multiply_1stage_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh.log: 
validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh'; \ + 
$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_gpu_random_default.sh.log: validate_complex_double_cholesky_1stage_gpu_random_default.sh + @p='validate_complex_double_cholesky_1stage_gpu_random_default.sh'; \ + b='validate_complex_double_cholesky_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs 
\ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_gpu_random_default.sh.log: validate_real_double_cholesky_1stage_gpu_random_default.sh + @p='validate_real_double_cholesky_1stage_gpu_random_default.sh'; \ + b='validate_real_double_cholesky_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh.log: validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh + @p='validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh'; \ + b='validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_gpu_random_default.sh.log: validate_complex_single_cholesky_1stage_gpu_random_default.sh + @p='validate_complex_single_cholesky_1stage_gpu_random_default.sh'; \ + b='validate_complex_single_cholesky_1stage_gpu_random_default.sh'; \ + 
$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_gpu_random_default.sh.log: validate_real_single_cholesky_1stage_gpu_random_default.sh + @p='validate_real_single_cholesky_1stage_gpu_random_default.sh'; \ + b='validate_real_single_cholesky_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_gpu_random_default.sh.log: validate_complex_double_eigenvectors_1stage_gpu_random_default.sh + 
@p='validate_complex_double_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_complex_double_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + 
--log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_random_default.sh.log: validate_real_double_eigenvectors_1stage_gpu_random_default.sh + @p='validate_real_double_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh.log: validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh + 
@p='validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + 
--log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_gpu_random_default.sh.log: 
validate_complex_single_eigenvectors_1stage_gpu_random_default.sh + @p='validate_complex_single_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_complex_single_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + 
b='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_gpu_random_default.sh.log: validate_real_single_eigenvectors_1stage_gpu_random_default.sh + @p='validate_real_single_eigenvectors_1stage_gpu_random_default.sh'; \ + b='validate_real_single_eigenvectors_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh + 
@p='validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh.log: validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh + @p='validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + b='validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_1stage_gpu_random_default.sh.log: validate_complex_double_generalized_1stage_gpu_random_default.sh + @p='validate_complex_double_generalized_1stage_gpu_random_default.sh'; \ + b='validate_complex_double_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh.log: validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh + @p='validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + b='validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_double_generalized_1stage_gpu_random_default.sh.log: validate_real_double_generalized_1stage_gpu_random_default.sh + @p='validate_real_double_generalized_1stage_gpu_random_default.sh'; \ + b='validate_real_double_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh.log: validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh + @p='validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + b='validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_1stage_gpu_random_default.sh.log: validate_complex_single_generalized_1stage_gpu_random_default.sh + @p='validate_complex_single_generalized_1stage_gpu_random_default.sh'; \ + b='validate_complex_single_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh.log: validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh + @p='validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + b='validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_1stage_gpu_random_default.sh.log: validate_real_single_generalized_1stage_gpu_random_default.sh + @p='validate_real_single_generalized_1stage_gpu_random_default.sh'; \ + b='validate_real_single_generalized_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh.log: validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + 
b='validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_generalized_decomp_1stage_gpu_random_default.sh.log: validate_real_double_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_real_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_real_double_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh.log: validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_generalized_decomp_1stage_gpu_random_default.sh.log: validate_real_single_generalized_decomp_1stage_gpu_random_default.sh + @p='validate_real_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + b='validate_real_single_generalized_decomp_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh.log: validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh + @p='validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh'; \ + b='validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh'; \ + 
$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh.log: validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh + @p='validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh'; \ + b='validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh.log: validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh + @p='validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh.log: 
validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh + @p='validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh'; \ + b='validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh.log: validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh + @p='validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + b='validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh.log: validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh + @p='validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh'; \ + b='validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_toeplitz_default.sh.log: validate_complex_double_cholesky_1stage_toeplitz_default.sh + @p='validate_complex_double_cholesky_1stage_toeplitz_default.sh'; \ + b='validate_complex_double_cholesky_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_toeplitz_default.sh.log: validate_real_double_cholesky_1stage_toeplitz_default.sh + @p='validate_real_double_cholesky_1stage_toeplitz_default.sh'; \ + b='validate_real_double_cholesky_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + 
$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_toeplitz_default.sh.log: validate_complex_single_cholesky_1stage_toeplitz_default.sh + @p='validate_complex_single_cholesky_1stage_toeplitz_default.sh'; \ + b='validate_complex_single_cholesky_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_toeplitz_default.sh.log: validate_real_single_cholesky_1stage_toeplitz_default.sh + @p='validate_real_single_cholesky_1stage_toeplitz_default.sh'; \ + b='validate_real_single_cholesky_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + 
--log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_1stage_toeplitz_default.sh.log: validate_complex_double_eigenvalues_1stage_toeplitz_default.sh + @p='validate_complex_double_eigenvalues_1stage_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvalues_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh.log: validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh + @p='validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh + 
@p='validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_toeplitz_default.sh.log: validate_real_double_eigenvalues_1stage_toeplitz_default.sh + @p='validate_real_double_eigenvalues_1stage_toeplitz_default.sh'; \ + b='validate_real_double_eigenvalues_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" 
$(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_1stage_toeplitz_default.sh.log: validate_complex_single_eigenvalues_1stage_toeplitz_default.sh + @p='validate_complex_single_eigenvalues_1stage_toeplitz_default.sh'; \ + b='validate_complex_single_eigenvalues_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh.log: validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh + @p='validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + 
b='validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_1stage_toeplitz_default.sh.log: validate_real_single_eigenvalues_1stage_toeplitz_default.sh + @p='validate_real_single_eigenvalues_1stage_toeplitz_default.sh'; \ + b='validate_real_single_eigenvalues_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh.log: validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh + @p='validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_toeplitz_default.sh.log: validate_complex_double_eigenvectors_1stage_toeplitz_default.sh + @p='validate_complex_double_eigenvectors_1stage_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvectors_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + 
$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_toeplitz_default.sh.log: validate_real_double_eigenvectors_1stage_toeplitz_default.sh + @p='validate_real_double_eigenvectors_1stage_toeplitz_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + 
b='validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_complex_single_eigenvectors_1stage_toeplitz_default.sh.log: validate_complex_single_eigenvectors_1stage_toeplitz_default.sh + @p='validate_complex_single_eigenvectors_1stage_toeplitz_default.sh'; \ + b='validate_complex_single_eigenvectors_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + 
b='validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_toeplitz_default.sh.log: validate_real_single_eigenvectors_1stage_toeplitz_default.sh + @p='validate_real_single_eigenvectors_1stage_toeplitz_default.sh'; \ + b='validate_real_single_eigenvectors_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh.log: 
validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh'; \ + $(am__check_pre) 
$(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh.log: validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh + @p='validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh'; \ + b='validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh.log: validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh + @p='validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh.log: validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh + 
@p='validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh'; \ + b='validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh.log: validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh + @p='validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh'; \ + b='validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh.log: validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh + @p='validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh.log: validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh + @p='validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh'; \ + b='validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh.log: validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh + @p='validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh.log: validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh + @p='validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + 
b='validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh.log: validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh + @p='validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh.log: validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh + @p='validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + 
b='validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- 
$(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh.log: validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh + @p='validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + 
b='validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh.log: validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh + @p='validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) 
$(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh.log: validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh + @p='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + b='validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh.log: validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh + 
@p='validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh.log: validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh + @p='validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + b='validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file 
$$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh.log: validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh + @p='validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + b='validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh.log: 
validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh.log: validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh + @p='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + b='validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + 
b='validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh.log: validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh + @p='validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh.log: validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh + @p='validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + b='validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh.log: validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh + @p='validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + b='validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh + 
@p='validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh.log: validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh + @p='validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh.log: validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh + @p='validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + b='validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh.log: validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh + @p='validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh'; \ + b='validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ 
+ "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_autotune_c_version_complex_double_extended.sh.log: validate_autotune_c_version_complex_double_extended.sh + @p='validate_autotune_c_version_complex_double_extended.sh'; \ + b='validate_autotune_c_version_complex_double_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_autotune_c_version_real_double_extended.sh.log: validate_autotune_c_version_real_double_extended.sh + @p='validate_autotune_c_version_real_double_extended.sh'; \ + b='validate_autotune_c_version_real_double_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_autotune_c_version_complex_single_extended.sh.log: validate_autotune_c_version_complex_single_extended.sh + @p='validate_autotune_c_version_complex_single_extended.sh'; \ + b='validate_autotune_c_version_complex_single_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_autotune_c_version_real_single_extended.sh.log: validate_autotune_c_version_real_single_extended.sh + @p='validate_autotune_c_version_real_single_extended.sh'; \ + b='validate_autotune_c_version_real_single_extended.sh'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +validate_autotune_complex_double_extended.sh.log: validate_autotune_complex_double_extended.sh + 
@p='validate_autotune_complex_double_extended.sh'; \ + b='validate_autotune_complex_double_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa1_test_real_with_c@SUFFIX@.sh.log: elpa1_test_real_with_c@SUFFIX@.sh - @p='elpa1_test_real_with_c@SUFFIX@.sh'; \ - b='elpa1_test_real_with_c@SUFFIX@.sh'; \ +validate_autotune_real_double_extended.sh.log: validate_autotune_real_double_extended.sh + @p='validate_autotune_real_double_extended.sh'; \ + b='validate_autotune_real_double_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_real@SUFFIX@.sh.log: elpa2_test_real@SUFFIX@.sh - @p='elpa2_test_real@SUFFIX@.sh'; \ - b='elpa2_test_real@SUFFIX@.sh'; \ +validate_autotune_complex_single_extended.sh.log: validate_autotune_complex_single_extended.sh + @p='validate_autotune_complex_single_extended.sh'; \ + b='validate_autotune_complex_single_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_real_default_kernel@SUFFIX@.sh.log: elpa2_test_real_default_kernel@SUFFIX@.sh - @p='elpa2_test_real_default_kernel@SUFFIX@.sh'; \ - b='elpa2_test_real_default_kernel@SUFFIX@.sh'; \ +validate_autotune_real_single_extended.sh.log: validate_autotune_real_single_extended.sh + @p='validate_autotune_real_single_extended.sh'; \ + b='validate_autotune_real_single_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- 
$(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa1_test_complex@SUFFIX@.sh.log: elpa1_test_complex@SUFFIX@.sh - @p='elpa1_test_complex@SUFFIX@.sh'; \ - b='elpa1_test_complex@SUFFIX@.sh'; \ +validate_multiple_objs_real_double_extended.sh.log: validate_multiple_objs_real_double_extended.sh + @p='validate_multiple_objs_real_double_extended.sh'; \ + b='validate_multiple_objs_real_double_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_complex@SUFFIX@.sh.log: elpa2_test_complex@SUFFIX@.sh - @p='elpa2_test_complex@SUFFIX@.sh'; \ - b='elpa2_test_complex@SUFFIX@.sh'; \ +test_skewsymmetric_real_double_extended.sh.log: test_skewsymmetric_real_double_extended.sh + @p='test_skewsymmetric_real_double_extended.sh'; \ + b='test_skewsymmetric_real_double_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_complex_default_kernel@SUFFIX@.sh.log: elpa2_test_complex_default_kernel@SUFFIX@.sh - @p='elpa2_test_complex_default_kernel@SUFFIX@.sh'; \ - b='elpa2_test_complex_default_kernel@SUFFIX@.sh'; \ +test_skewsymmetric_real_single_extended.sh.log: test_skewsymmetric_real_single_extended.sh + @p='test_skewsymmetric_real_single_extended.sh'; \ + b='test_skewsymmetric_real_single_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh.log: elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh - @p='elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh'; \ - 
b='elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh'; \ +validate_multiple_objs_real_double_c_version_extended.sh.log: validate_multiple_objs_real_double_c_version_extended.sh + @p='validate_multiple_objs_real_double_c_version_extended.sh'; \ + b='validate_multiple_objs_real_double_c_version_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh.log: elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh - @p='elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh'; \ - b='elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh'; \ +validate_split_comm_real_double_extended.sh.log: validate_split_comm_real_double_extended.sh + @p='validate_split_comm_real_double_extended.sh'; \ + b='validate_split_comm_real_double_extended.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh.log: elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh - @p='elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh'; \ - b='elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh'; \ +validate_double_instance@SUFFIX@_default.sh.log: validate_double_instance@SUFFIX@_default.sh + @p='validate_double_instance@SUFFIX@_default.sh'; \ + b='validate_double_instance@SUFFIX@_default.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_print_kernels@SUFFIX@.log: elpa2_print_kernels@SUFFIX@$(EXEEXT) - @p='elpa2_print_kernels@SUFFIX@$(EXEEXT)'; \ - b='elpa2_print_kernels@SUFFIX@'; \ 
+validate_real_2stage_banded@SUFFIX@_default.sh.log: validate_real_2stage_banded@SUFFIX@_default.sh + @p='validate_real_2stage_banded@SUFFIX@_default.sh'; \ + b='validate_real_2stage_banded@SUFFIX@_default.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa1_test_real_c_version@SUFFIX@.sh.log: elpa1_test_real_c_version@SUFFIX@.sh - @p='elpa1_test_real_c_version@SUFFIX@.sh'; \ - b='elpa1_test_real_c_version@SUFFIX@.sh'; \ +validate_complex_2stage_banded@SUFFIX@_default.sh.log: validate_complex_2stage_banded@SUFFIX@_default.sh + @p='validate_complex_2stage_banded@SUFFIX@_default.sh'; \ + b='validate_complex_2stage_banded@SUFFIX@_default.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa1_test_complex_c_version@SUFFIX@.sh.log: elpa1_test_complex_c_version@SUFFIX@.sh - @p='elpa1_test_complex_c_version@SUFFIX@.sh'; \ - b='elpa1_test_complex_c_version@SUFFIX@.sh'; \ +validate_single_real_2stage_banded@SUFFIX@_default.sh.log: validate_single_real_2stage_banded@SUFFIX@_default.sh + @p='validate_single_real_2stage_banded@SUFFIX@_default.sh'; \ + b='validate_single_real_2stage_banded@SUFFIX@_default.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_real_c_version@SUFFIX@.sh.log: elpa2_test_real_c_version@SUFFIX@.sh - @p='elpa2_test_real_c_version@SUFFIX@.sh'; \ - b='elpa2_test_real_c_version@SUFFIX@.sh'; \ +validate_single_complex_2stage_banded@SUFFIX@_default.sh.log: validate_single_complex_2stage_banded@SUFFIX@_default.sh + 
@p='validate_single_complex_2stage_banded@SUFFIX@_default.sh'; \ + b='validate_single_complex_2stage_banded@SUFFIX@_default.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) -elpa2_test_complex_c_version@SUFFIX@.sh.log: elpa2_test_complex_c_version@SUFFIX@.sh - @p='elpa2_test_complex_c_version@SUFFIX@.sh'; \ - b='elpa2_test_complex_c_version@SUFFIX@.sh'; \ +test_python.sh.log: test_python.sh + @p='test_python.sh'; \ + b='test_python.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ @@ -2278,7 +20355,10 @@ @am__EXEEXT_TRUE@ $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ @am__EXEEXT_TRUE@ "$$tst" $(AM_TESTS_FD_REDIRECT) -distdir: $(DISTFILES) +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) $(am__remove_distdir) test -d "$(distdir)" || mkdir "$(distdir)" @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ @@ -2318,7 +20398,7 @@ ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \ || chmod -R a+r "$(distdir)" dist-gzip: distdir - tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz + tardir=$(distdir) && $(am__tar) | eval GZIP= gzip $(GZIP_ENV) -c >$(distdir).tar.gz $(am__post_remove_distdir) dist-bzip2: distdir @@ -2344,7 +20424,7 @@ @echo WARNING: "Support for shar distribution archives is" \ "deprecated." 
>&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 - shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz + shar $(distdir) | eval GZIP= gzip $(GZIP_ENV) -c >$(distdir).shar.gz $(am__post_remove_distdir) dist-zip: distdir @@ -2362,7 +20442,7 @@ distcheck: dist case '$(DIST_ARCHIVES)' in \ *.tar.gz*) \ - GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\ + eval GZIP= gzip $(GZIP_ENV) -dc $(distdir).tar.gz | $(am__untar) ;;\ *.tar.bz2*) \ bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\ *.tar.lz*) \ @@ -2372,7 +20452,7 @@ *.tar.Z*) \ uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ *.shar.gz*) \ - GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\ + eval GZIP= gzip $(GZIP_ENV) -dc $(distdir).shar.gz | unshar ;;\ *.zip*) \ unzip $(distdir).zip ;;\ esac @@ -2446,12 +20526,12 @@ $(MAKE) $(AM_MAKEFLAGS) check-TESTS check: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) check-am -all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(MANS) $(DATA) $(HEADERS) \ +all-am: Makefile $(PROGRAMS) $(LTLIBRARIES) $(MANS) $(DATA) $(HEADERS) \ config.h install-binPROGRAMS: install-libLTLIBRARIES installdirs: - for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(filesdir)" "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(elpa_includedir)"; do \ + for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(pyelpadir)" "$(DESTDIR)$(pyelpadir)" "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(elpa_includedir)" "$(DESTDIR)$(elpa_includedir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: $(BUILT_SOURCES) @@ -2485,20 +20565,38 @@ distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -rm -f python/pyelpa/$(DEPDIR)/$(am__dirstamp) + -rm -f python/pyelpa/$(am__dirstamp) -rm -f src/$(DEPDIR)/$(am__dirstamp) -rm -f src/$(am__dirstamp) - -rm -f src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) - -rm -f src/elpa2_kernels/$(am__dirstamp) - -rm -f src/elpa_qr/$(DEPDIR)/$(am__dirstamp) - -rm -f src/elpa_qr/$(am__dirstamp) + -rm -f src/GPU/$(DEPDIR)/$(am__dirstamp) + -rm -f src/GPU/$(am__dirstamp) + -rm -f src/elpa1/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa1/$(am__dirstamp) + -rm -f src/elpa2/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa2/$(am__dirstamp) + -rm -f src/elpa2/GPU/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa2/GPU/$(am__dirstamp) + -rm -f src/elpa2/kernels/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa2/kernels/$(am__dirstamp) + -rm -f src/elpa2/qr/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa2/qr/$(am__dirstamp) + -rm -f src/elpa_generalized/$(DEPDIR)/$(am__dirstamp) + -rm -f src/elpa_generalized/$(am__dirstamp) -rm -f src/ftimings/$(DEPDIR)/$(am__dirstamp) -rm -f src/ftimings/$(am__dirstamp) - -rm -f test/c_test_programs/$(DEPDIR)/$(am__dirstamp) - -rm -f test/c_test_programs/$(am__dirstamp) - -rm -f test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) - -rm -f test/fortran_test_programs/$(am__dirstamp) - -rm -f test/shared_sources/$(DEPDIR)/$(am__dirstamp) - -rm -f test/shared_sources/$(am__dirstamp) + -rm -f src/general/$(DEPDIR)/$(am__dirstamp) + -rm -f src/general/$(am__dirstamp) + -rm -f src/helpers/$(DEPDIR)/$(am__dirstamp) + -rm -f src/helpers/$(am__dirstamp) + -rm -f test/C/$(DEPDIR)/$(am__dirstamp) + -rm -f test/C/$(am__dirstamp) + -rm -f test/Fortran/$(DEPDIR)/$(am__dirstamp) + -rm -f test/Fortran/$(am__dirstamp) + -rm -f test/Fortran/elpa2/$(DEPDIR)/$(am__dirstamp) + -rm -f test/Fortran/elpa2/$(am__dirstamp) + -rm -f test/shared/$(DEPDIR)/$(am__dirstamp) + -rm -f test/shared/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended 
for maintainers to use" @@ -2507,11 +20605,105 @@ clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \ - clean-libtool clean-local clean-noinstPROGRAMS mostlyclean-am + clean-libtool clean-local clean-noinstLTLIBRARIES \ + clean-noinstPROGRAMS clean-pyelpaLTLIBRARIES mostlyclean-am distclean: distclean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -rf src/elpa2_kernels/$(DEPDIR) src/ftimings/$(DEPDIR) test/c_test_programs/$(DEPDIR) test/shared_sources/$(DEPDIR) + -rm -f python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Plo + -rm -f src/$(DEPDIR)/elpa_c_interface.Plo + -rm -f src/$(DEPDIR)/elpa_index.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_double_precision.Plo + -rm -f 
src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_double_precision.Plo + -rm -f 
src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_single_precision.Plo + -rm -f src/elpa_generalized/$(DEPDIR)/cannon.Plo + -rm -f src/ftimings/$(DEPDIR)/highwater_mark.Plo + -rm -f src/ftimings/$(DEPDIR)/papi.Plo + -rm -f src/ftimings/$(DEPDIR)/resident_set_size.Plo + -rm -f src/ftimings/$(DEPDIR)/time.Plo + -rm -f src/ftimings/$(DEPDIR)/virtual_memory.Plo + -rm -f src/helpers/$(DEPDIR)/get_cpuid_set.Plo + -rm -f src/helpers/$(DEPDIR)/print_build_config.Plo + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f 
test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po + -rm -f 
test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po + -rm -f test/shared/$(DEPDIR)/test_redir.Plo -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-hdr distclean-libtool distclean-local distclean-tags @@ -2528,9 +20720,11 @@ info-am: -install-data-am: install-dist_docDATA install-dist_filesDATA \ - install-man install-nobase_elpa_includeHEADERS \ - install-pkgconfigDATA +install-data-am: install-dist_docDATA install-man \ + install-nobase_elpa_includeHEADERS \ + install-nobase_nodist_elpa_includeHEADERS \ + install-pkgconfigDATA install-pyelpaLTLIBRARIES \ + install-pyelpaPYTHON install-dvi: install-dvi-am @@ -2561,7 +20755,100 @@ maintainer-clean: maintainer-clean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm -rf $(top_srcdir)/autom4te.cache - -rm -rf src/elpa2_kernels/$(DEPDIR) src/ftimings/$(DEPDIR) test/c_test_programs/$(DEPDIR) test/shared_sources/$(DEPDIR) + -rm -f python/pyelpa/$(DEPDIR)/wrapper_la-wrapper.Plo + -rm -f src/$(DEPDIR)/elpa_c_interface.Plo + -rm -f src/$(DEPDIR)/elpa_index.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_1hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx-avx2_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_1hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_avx512_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_1hv_single_precision.Plo + -rm -f 
src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/complex_sse_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx-avx2_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_avx512_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_neon_arch64_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sparc64_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_double_precision.Plo + -rm -f 
src/elpa2/kernels/$(DEPDIR)/real_sse_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_sse_6hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_2hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_4hv_single_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_double_precision.Plo + -rm -f src/elpa2/kernels/$(DEPDIR)/real_vsx_6hv_single_precision.Plo + -rm -f src/elpa_generalized/$(DEPDIR)/cannon.Plo + -rm -f src/ftimings/$(DEPDIR)/highwater_mark.Plo + -rm -f src/ftimings/$(DEPDIR)/papi.Plo + -rm -f src/ftimings/$(DEPDIR)/resident_set_size.Plo + -rm -f src/ftimings/$(DEPDIR)/time.Plo + -rm -f src/ftimings/$(DEPDIR)/virtual_memory.Plo + -rm -f src/helpers/$(DEPDIR)/get_cpuid_set.Plo + -rm -f src/helpers/$(DEPDIR)/print_build_config.Plo + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_complex_double-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_complex_single-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_real_double-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_autotune_c_version_real_single-test_autotune.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_1stage_random-test.Po + -rm -f 
test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_double_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_complex_single_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_double_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_1stage_random-test.Po + -rm 
-f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_eigenvectors_2stage_default_kernel_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_gpu_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_c_version_real_single_generalized_decomp_1stage_random-test.Po + -rm -f test/C/$(DEPDIR)/validate_multiple_objs_real_double_c_version-test_multiple_objs.Po + -rm -f test/shared/$(DEPDIR)/test_redir.Plo -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic @@ -2579,96 +20866,119 @@ ps-am: uninstall-am: uninstall-binPROGRAMS uninstall-dist_docDATA \ - uninstall-dist_filesDATA uninstall-libLTLIBRARIES \ - uninstall-man uninstall-nobase_elpa_includeHEADERS \ - uninstall-pkgconfigDATA + uninstall-libLTLIBRARIES uninstall-man \ + uninstall-nobase_elpa_includeHEADERS \ + uninstall-nobase_nodist_elpa_includeHEADERS \ + uninstall-pkgconfigDATA uninstall-pyelpaLTLIBRARIES \ + uninstall-pyelpaPYTHON uninstall-man: uninstall-man1 uninstall-man3 .MAKE: all check check-am install install-am install-strip -.PHONY: CTAGS GTAGS TAGS all all-am am--refresh check check-TESTS \ - check-am clean clean-binPROGRAMS clean-cscope clean-generic \ - clean-libLTLIBRARIES clean-libtool clean-local \ - clean-noinstPROGRAMS cscope cscopelist-am ctags ctags-am dist \ - dist-all dist-bzip2 dist-gzip dist-lzip dist-shar dist-tarZ \ - dist-xz dist-zip distcheck distclean distclean-compile \ - distclean-generic distclean-hdr distclean-libtool \ - distclean-local distclean-tags distcleancheck distdir \ - distuninstallcheck dvi dvi-am html html-am info info-am \ - install install-am install-binPROGRAMS install-data \ - install-data-am install-dist_docDATA 
install-dist_filesDATA \ - install-dvi install-dvi-am install-exec install-exec-am \ - install-html install-html-am install-info install-info-am \ +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles am--refresh check \ + check-TESTS check-am clean clean-binPROGRAMS clean-cscope \ + clean-generic clean-libLTLIBRARIES clean-libtool clean-local \ + clean-noinstLTLIBRARIES clean-noinstPROGRAMS \ + clean-pyelpaLTLIBRARIES cscope cscopelist-am ctags ctags-am \ + dist dist-all dist-bzip2 dist-gzip dist-lzip dist-shar \ + dist-tarZ dist-xz dist-zip distcheck distclean \ + distclean-compile distclean-generic distclean-hdr \ + distclean-libtool distclean-local distclean-tags \ + distcleancheck distdir distuninstallcheck dvi dvi-am html \ + html-am info info-am install install-am install-binPROGRAMS \ + install-data install-data-am install-dist_docDATA install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am \ install-libLTLIBRARIES install-man install-man1 install-man3 \ - install-nobase_elpa_includeHEADERS install-pdf install-pdf-am \ - install-pkgconfigDATA install-ps install-ps-am install-strip \ + install-nobase_elpa_includeHEADERS \ + install-nobase_nodist_elpa_includeHEADERS install-pdf \ + install-pdf-am install-pkgconfigDATA install-ps install-ps-am \ + install-pyelpaLTLIBRARIES install-pyelpaPYTHON install-strip \ installcheck installcheck-am installdirs maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-compile \ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ recheck tags tags-am uninstall uninstall-am \ uninstall-binPROGRAMS uninstall-dist_docDATA \ - uninstall-dist_filesDATA uninstall-libLTLIBRARIES \ - uninstall-man uninstall-man1 uninstall-man3 \ - uninstall-nobase_elpa_includeHEADERS uninstall-pkgconfigDATA + uninstall-libLTLIBRARIES uninstall-man uninstall-man1 \ + uninstall-man3 uninstall-nobase_elpa_includeHEADERS \ + uninstall-nobase_nodist_elpa_includeHEADERS \ 
+ uninstall-pkgconfigDATA uninstall-pyelpaLTLIBRARIES \ + uninstall-pyelpaPYTHON .PRECIOUS: Makefile + +# Cuda files +.cu.lo: + NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@ define extract_interface - @echo "Generating $@..."; + @echo "Extracting interface marked with '$1' from $@..."; @grep -h "^ *$1" $^ | sed 's/^ *$1//;' >> $@ || { rm $@; exit 1; } endef -elpa test: - mkdir $@ - -test/shared_sources: | test - mkdir $@ +elpa test src: + @mkdir $@ +test/shared: | test + @mkdir $@ config-f90.h: config.h @echo "Generating $@..."; @grep "^#define" $^ > $@ || { rm $@; exit 1; } - -elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa +elpa/elpa_generated.h: $(top_srcdir)/src/elpa_impl.F90 \ + $(top_srcdir)/src/elpa_impl_math_template.F90 \ + $(top_srcdir)/src/elpa_api.F90 | elpa + @rm -f $@ $(call extract_interface,!c>) - -test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources + $(call extract_interface,!c_o>) + $(call extract_interface,!c_no>) +test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shared + @rm -f $@ $(call extract_interface,!c>) - -elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa +src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/helpers/*.c $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src + @rm -f $@ $(call extract_interface,!f>) $(call extract_interface,#!f>) -generated-headers: $(generated_headers) -%.sh: % - echo '$(wrapper)./$^ $$TEST_FLAGS' > $@ - chmod +x $@ - -#elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh: -# echo 
'$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@ -# chmod +x $@ - -#elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh: -# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@ -# chmod +x $@ - -# Preprocessed files (just used for manual inspection) -elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ +src/elpa_generated_public_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/*.[ch])) | src + @rm -f $@ + $(call extract_interface,!pf>) +src/fortran_constants.F90: $(top_srcdir)/src/fortran_constants.h | src + @$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -I. $< -o $@_ || { rm -f $@; exit 1; } + @awk '/!ELPA_C_DEFINE/ {gsub(/!ELPA_C_DEFINE/, "\n"); gsub(/NEWLINE/, "\n"); print;}' < $@_ > $@ || { rm -f $@; exit 1; } + @rm $@_ -elpa2.i: $(top_srcdir)/src/elpa2.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@ - -elpa1.i: $(top_srcdir)/src/elpa1.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ - -elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ +generated-headers: $(generated_headers) -mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ +# vim: syntax=make -mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 - $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@ +python/pyelpa/wrapper.c: python/pyelpa/wrapper.pyx + cython $< -o $@ +# test scripts +TASKS ?= 2 +%_extended.sh: % + @echo "#!/bin/bash" > $@ + @echo 'if [ "$$CHECK_LEVEL" = 
"extended" ] ; then $(wrapper) ./$^ $$TEST_FLAGS ; else exit 77; fi' >> $@ + @chmod +x $@ + +%_default.sh: % + @echo "#!/bin/bash" > $@ + @echo '$(wrapper)' ./$^ '$$TEST_FLAGS' >> $@ + @chmod +x $@ + +test_python.sh: + @echo '#!/bin/bash' > $@ +# this is kind of hacky... is there a better way to get wrapper.so? + @echo 'export PYTHONPATH=./python-copy:$$PYTHONPATH' >> $@ + @echo 'cp -r $(abs_top_srcdir)/python python-copy || exit 1' >> $@ + @echo 'chmod u+rwX -R python-copy || exit 1' >> $@ + @echo 'cp .libs/wrapper.so python-copy/pyelpa/ || exit 1' >> $@ +# the dlopen flags are needed for MKL to work properly... +# only in os from python 3.3 on + @echo "$(wrapper) $(PYTHON) -c 'import sys, os; sys.setdlopenflags(os.RTLD_NOW | os.RTLD_GLOBAL); import pytest; sys.exit(pytest.main([\"./python-copy\", \"-p\", \"no:cacheprovider\"]))'" >> $@ + @echo 'exit_code=$$?' >> $@ + @echo 'rm -rf python-copy || exit 1' >> $@ + @echo 'exit $$exit_code' >> $@ + @chmod +x $@ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps @@ -2717,20 +21027,24 @@ @DX_COND_doc_TRUE@ $(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG) clean-local: - -rm -rf modules/* .fortran_dependencies/* + -rm -rf modules/* private_modules/* test_modules/* .fortran_dependencies/* + -rm -rf validate_*.sh + -rm -rf real_2stage*.sh + -rm -rf complex_2stage*.sh + -rm -rf single_complex_2stage*.sh + -rm -rf single_real_2stage*.sh + -rm -rf double_instance_onenode*.sh -rm -rf $(generated_headers) distclean-local: + -rm -rf ./m4 + -rm -rf ./src + -rm -rf ./test + -rm -rf ./modules + -rm -rf .fortran_dependencies -rm config-f90.h - -rm -rf ./src/elpa2_kernels/.deps - -rm -rf ./src/.deps -rm -rf ./test/.deps - -rmdir ./src/elpa2_kernels/ - -rmdir ./src - -rmdir ./test - -rmdir ./m4 - -rmdir modules/ - -rmdir .fortran_dependencies/ + -rm -rf elpa/elpa_generated_c_api.h libtool: $(LIBTOOL_DEPS) $(SHELL) ./config.status libtool diff -Nru elpa-2016.05.001/man/elpa2_print_kernels.1 
elpa-2019.11.001/man/elpa2_print_kernels.1 --- elpa-2016.05.001/man/elpa2_print_kernels.1 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/elpa2_print_kernels.1 2019-12-19 09:47:44.000000000 +0000 @@ -23,5 +23,5 @@ .SH "Reporting bugs" Report bugs to the ELPA mail elpa-library@mpcdf.mpg.de .SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) +\fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_allocate.3 elpa-2019.11.001/man/elpa_allocate.3 --- elpa-2016.05.001/man/elpa_allocate.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_allocate.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,52 @@ +.TH "elpa_allocate" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_allocate \- allocate an instance of the ELPA library +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "elpa => \fBelpa_allocate\fP (error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.RI "integer, optional :: \fBerror\fp ! a returned error code +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBelpa_t\fP handle = \fBelpa_allocate\fP(\fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // returns an handle to the allocated ELPA object" +.br +.RI "int \fB*error\fP; // a returned error code +.br + + +.SH DESCRIPTION +Allocate an ELPA object. 
The function \fBelpa_init\fP(3) must be called once \fIBEFORE\fP \fBelpa_allocate\fP can be called. +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_autotune_deallocate.3 elpa-2019.11.001/man/elpa_autotune_deallocate.3 --- elpa-2016.05.001/man/elpa_autotune_deallocate.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_deallocate.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,56 @@ +.TH "elpa_autotune_deallocate" 3 "Tue Nov 28 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_deallocate \- Deallocates an ELPA autotuning instance +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: tune_state +.br + +.RI "call\fBelpa%autotune_deallocate\fP (tune_state, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "type(elpa_autotune_t) :: \fBtune_state\fP ! the ELPA autotuning object, created with \fBelpa_autotune_setup\fP(3) +.br +.RI "integer, optional :: \fBerror\fP ! 
the returned error code +.br +.br +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "void \fBelpa_autotune_deallocate\fP (\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.RI "elpa_autotune_t \fBautotune_handle\fP; // the handle of an ELPA object, obtained before with \fBelpa_autotune_setup\fP(3)" +.br +.RI "int \fB*error\fP; // the returned error code" +.br +.br +.SH DESCRIPTION +Deallocates an ELPA autotuning instance. \fIPrior\fP to calling the elpa_autotune_deallocate method, an ELPA autotuning object must have been created. See \fBelpa_autotune_setup\fP(3) +.SH "SEE ALSO" +.br +\fBelpa_autotune_step\fP(3) \fBelpa_autotune_setup\fP(3) \fBelpa_autotune_deallocate\fP(3) + diff -Nru elpa-2016.05.001/man/elpa_autotune_load_state.3 elpa-2019.11.001/man/elpa_autotune_load_state.3 --- elpa-2016.05.001/man/elpa_autotune_load_state.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_load_state.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,64 @@ +.TH "elpa_autotune_load_state" 3 "Tue Nov 13 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_load_state \- loads a state of an ELPA autotuning object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: autotune +.br + +.RI "call elpa%\fBautotune_load_state\fP (autotune, filename, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\fP: an instance of the ELPA object" +.br +.RI "class(elpa_autotune_t) \fBautotune\fP: an instance of the ELPA autotune object" +.br +.RI "character(*) \fBfilename\fP: The filename to be used for loading the settings" +.br +.RI 
"integer, optional \fBerror\fP: An error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "void \fBelpa_autotune_load_state\fP(\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle, \fBconst char\fP *filename, \fBint\fP *error): +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.TP +.RI "elpa_autotune_t \fBhandle\fP: The handle to the ELPA autotune object" +.br +.TP +.RI "const char \fB*filename\fP: the filename to load the settings" +.br +.RI "int \fB*error\fP: the error return code" +.TP + +.SH DESCRIPTION +Loads a previously stored state of an autotune object. With the loaded state the autotuning could be resumed +.SH "SEE ALSO" +.br +\fBelpa_autotune_save_state\fP(3) diff -Nru elpa-2016.05.001/man/elpa_autotune_print_state.3 elpa-2019.11.001/man/elpa_autotune_print_state.3 --- elpa-2016.05.001/man/elpa_autotune_print_state.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_print_state.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,59 @@ +.TH "elpa_autotune_print_state" 3 "Tue Nov 13 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_print_state \- prints the current state of an ELPA autotuning object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: autotune +.br + +.RI "call elpa%\fBautotune_print_state\fP (autotune, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\fP: an instance of the ELPA object" +.br +.RI "class(elpa_autotune_t) \fBautotune\fP: an instance of the ELPA autotune object" +.br +.RI "integer, optional \fBerror\fP: An error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "void 
\fBelpa_autotune_print_state\fP(\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle, \fBint\fP *error): +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.TP +.RI "elpa_autotune_t \fBhandle\fP: The handle to the ELPA autotune object" +.br +.RI "int \fB*error\fP: the error return code" +.TP + +.SH DESCRIPTION +Prints the current state of an autotune object. +.SH "SEE ALSO" +.br +\fBelpa_autotune_save_state\fP(3) \fBelpa_autotune_load_state\fP(3) diff -Nru elpa-2016.05.001/man/elpa_autotune_save_state.3 elpa-2019.11.001/man/elpa_autotune_save_state.3 --- elpa-2016.05.001/man/elpa_autotune_save_state.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_save_state.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,64 @@ +.TH "elpa_autotune_save_state" 3 "Tue Nov 13 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_save_state \- saves the current state of an ELPA autotuning object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: autotune +.br + +.RI "call elpa%\fBautotune_save_state\fP (autotune, filename, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\f: an instance of the ELPA object" +.br +.RI "class(elpa_autotune_t) \fBautotune\f: an instance of the ELPA autotune object" +.br +.RI "character(*) \fBfilename\fP: The filename to be used for storing the settings" +.br +.RI "integer, optional \fBerror\fP: A error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "void \fBelpa_autotune_save_state\fP(\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle, \fBconst char\fP *filename, \fBint\fP *error): +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br 
+.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.TP +.RI "elpa_autotune_t \fBhandle\fP: The handle to the ELPA autotune object" +.br +.TP +.RI "const char \fB*filename\fP: the filename to store the settings" +.br +.RI "int \fB*error\fP: the error return code" +.TP + +.SH DESCRIPTION +Saves the current state of an autotune object. The state can be restored with \fBelpa_autotune_load_state\fP.3 and the autoutuning could be resumed. +.SH "SEE ALSO" +.br +\fBelpa_autotune_load_state\fP(3) diff -Nru elpa-2016.05.001/man/elpa_autotune_set_best.3 elpa-2019.11.001/man/elpa_autotune_set_best.3 --- elpa-2016.05.001/man/elpa_autotune_set_best.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_set_best.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,51 @@ +.TH "elpa_autotune_set_best" 3 "Tue Nov 28 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_set_best \- Sets the tunable parameters to the up-to-now best solution +.br + +Before the autotuning options can be set, an autotuning step has to be done \fBelpa_autotune_step\fP(3) +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: tune_state +.br + +.RI "call\fBelpa%autotune_set_best\fP (tune_state)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "type(elpa_autotune_t) :: \fBtune_state\fP ! 
the ELPA autotuning object, created with \fBelpa_autotune_setup\fP(3) +.br +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "void \fBelpa_autotune_set_best\fP (\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.RI "elpa_autotune_t \fBautotune_handle\fP; // the handle of an ELPA object, obtained before with \fBelpa_autotune_setup\fP(3)" + +.SH DESCRIPTION +Sets the up-to-now best options for ELPA tunable parameters. \fIPrior\fP to calling the elpa_autotune_set_best method, an ELPA autotuning step must have been performed. See \fBelpa_autotune_set_best\fP(3) +.SH "SEE ALSO" +.br +\fBelpa_autotune_step\fP(3) \fBelpa_autotune_setup\fP(3) \fBelpa_autotune_deallocate\fp(3) + diff -Nru elpa-2016.05.001/man/elpa_autotune_setup.3 elpa-2019.11.001/man/elpa_autotune_setup.3 --- elpa-2016.05.001/man/elpa_autotune_setup.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_setup.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,57 @@ +.TH "elpa_autotune_setup" 3 "Tue Nov 28 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_setup \- create an instance for autotuning of the ELPA library +.br + +Before the autotuning object can be created, an instance of the ELPA library has to be setup, see e.g. \fBelpa_setup\fP(3) +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: tune_state +.br + +.RI "tune_state= \fBelpa%autotune_setup\fP (level, domain)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "integer :: \fBlevel\fP ! the level of the autotuning, at the moment ELPA_AUTOTUNE_FAST is supported" +.br +.RI "integer :: \fBdomain\fP ! 
the domain (real or complex) of the autotuning, can be either ELPA_AUTOTUNE_DOMAIN_REAL or ELPA_AUTOTUNE_DOMAIN_COMPLEX" +.br +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "\fBelpa_autotune_t\fP autotune_handle = \fBelpa_autotune_setup\fP (\fBelpa_t\fP handle, int level, int domain);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.RI "int \fBlevel\fP; // the level of the autotuning, at the moment "ELPA_AUTOTUNE_FAST" is supported +.br +.RI "int \fBdomain\fP; // the domain (real or complex) of the autotuning, can be either "ELPA_AUTOTUNE_DOMAIN_REAL" and "ELPA_AUTOTUNE_DOMAIN_COMPLEX" +.br +.RI "elpa_autotune_t \fBautotune_handel\fP; // the created handle of the autotune object + +.SH DESCRIPTION +Creates an ELPA autotuning object. \fIPrior\fP to calling the autotune_setup, an ELPA object must have been created. 
See \fBelpa_setup\fP(3) +.SH "SEE ALSO" +.br +\fBelpa_autotune_step\fP(3) \fBelpa_autotune_set_best\fP(3) \fBelpa_autotune_deallocate\fP(3) + diff -Nru elpa-2016.05.001/man/elpa_autotune_step.3 elpa-2019.11.001/man/elpa_autotune_step.3 --- elpa-2016.05.001/man/elpa_autotune_step.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_autotune_step.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,56 @@ +.TH "elpa_autotune_step" 3 "Tue Nov 28 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_autotune_step \- do one ELPA autotuning step +.br + +Before the autotuning step can be done, an instance of the ELPA autotune object has to be created, see \fBelpa_autotune_setup\fP(3) +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +class(elpa_autotune_t), pointer :: tune_state +.br + +.RI "unfinished = \fBelpa%autotune_step\fP (tune_state)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "type(elpa_autotune_t) :: \fBtune_state\fP ! the ELPA autotuning object, created with \fBelpa_autotune_setup\fP(3) +.br +.RI "logical :: \fBunfinished\fP ! logical, specifying whether autotuning has finished (.false.) or not (.true.) +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; +elpa_autotune_t autotune_handle; + +.br +.RI "\fBint\fP unfinished = \fBelpa_autotune_step\fP (\fBelpa_t\fP handle, \fBelpa_autotune_t\fP autotune_handle);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.RI "elpa_autotune_t \fBautotune_handle\fP; // the handle of the autotuning object, created with \fBelpa_autotune_setup\fP(3) +.br +.RI "int \fBunfinished\fP; // int, specifying whether autotuning has finished (0) or not (1) + +.SH DESCRIPTION +Does an ELPA autotuning step. 
\fIPrior\fP to calling the autotune_step, an ELPA autotune object must have been created. See \fBelpa_autotune_setup\fP(3) +.SH "SEE ALSO" +.br +\fBelpa_autotune_setup\fP(3) \fBelpa_autotune_set_best\fP(3) \fBelpa_autotune_deallocate\fp(3) + diff -Nru elpa-2016.05.001/man/elpa_cholesky.3 elpa-2019.11.001/man/elpa_cholesky.3 --- elpa-2016.05.001/man/elpa_cholesky.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_cholesky.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,58 @@ +.TH "elpa_cholesky" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_cholesky \- do a Cholesky factorization of a real symmetric or complex hermitian matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBcholesky\fP (a, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a which should be decomposed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_cholesky\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix which should be decomposed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". 
+.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the Cholesky decomposition of a real symmtric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_eigenvalues\fP can be called. +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fB(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_deallocate.3 elpa-2019.11.001/man/elpa_deallocate.3 --- elpa-2016.05.001/man/elpa_deallocate.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_deallocate.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,52 @@ +.TH "elpa_deallocate" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_deallocate \- deallocate an instance of the ELPA library after usage +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call \fBelpa_deallocate\fP (\fBclass(elpa_t)\fP elpa, \fBinteger\fP error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! the pointer to the instance of the ELPA library which should be deallocated" +.br +.RI "integer, optional :: \fBerror\fP ! the returned error code" +.br +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBvoid\fP \fBelpa_deallocate\fP(\fBelpa_t\fP handle, \fBint\fP *error^);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle to the ELPA instance which should be deallocated." 
+.br +.RI "int \fB*error\fP; // the returned error code" +.br +.br +.SH DESCRIPTION +Deallocate an ELPA object. The functions \fBelpa_init\fP(3) and \fBelpa_allocate\fP(3) must have been called \fIBEFORE\fP \fBelpa_deallocate\fP can be called. +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) diff -Nru elpa-2016.05.001/man/elpa_eigenvalues.3 elpa-2019.11.001/man/elpa_eigenvalues.3 --- elpa-2016.05.001/man/elpa_eigenvalues.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_eigenvalues.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,67 @@ +.TH "elpa_eigenvalues" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_eigenvalues \- computes the eigenvalues of a real symmetric or complex hermitian matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBeigenvalues\fP (a, ev, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)". The matrix has to be symmetric or hermitian, this is not checked by the routine. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. 
The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. Note that complex hermitian matrices also have real valued eigenvalues. +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *ev, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". The matrix has to be symmetric or hermitian, this is not checked by the routine. +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascendig\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the eigenvalues of a real symmetric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_eigenvalues\fP can be called. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_skew_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_skew_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_eigenvectors.3 elpa-2019.11.001/man/elpa_eigenvectors.3 --- elpa-2016.05.001/man/elpa_eigenvectors.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_eigenvectors.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,72 @@ +.TH "elpa_eigenvectors" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_eigenvectors \- computes the eigenvalues and (part of) the eigenvector spectrum for a real symmetric or complex hermitian matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBeigenvectors\fP (a, ev, q, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)". The matrix has to be symmetric or hermitian, this is not checked by the routine. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. Note that complex hermitian matrices also have real valued eigenvalues. 
+.RI "datatype :: \fBq\fP" +The storage space for the computed eigenvectors. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *ev, \fBdatatype\fP *q, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". The matrix has to be symmetric or hermitian, this is not checked by the routine. +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascendig\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. +.TP +.RI "datatype *\fBq\fP;" +The storage space for the computed eigenvectors. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". 
The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the eigenvalues and (parts of) the eigenvector spectrum of a real symmetric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_eigenvalues\fP can be called. Especially the number of eigenvectors to be computed can be set with \fPelpa_set\fB(3) +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_skew_eigenvalues\fP(3) \fBelpa_skew_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_generalized_eigenvalues.3 elpa-2019.11.001/man/elpa_generalized_eigenvalues.3 --- elpa-2016.05.001/man/elpa_generalized_eigenvalues.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_generalized_eigenvalues.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,82 @@ +.TH "elpa_generalized_eigenvalues" 3 "Wed Mar 14 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_generalized_eigenvalues \- computes the eigenvalues of a generalized eigenvalue problem for real symmetric or complex hermitian matrices +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBgeneralized_eigenvalues\fP (a, b, ev, is_already_decomopsed, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). 
The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "datatype :: \fBb\fP" +The matrix b defining the generalized eigenvalue problem. The dimensions and datatype of the matrix b has to be the same as for matrix a. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. Note that complex hermitian matrices also have real valued eigenvalues. +.TP +.RI "logical :: \fBis_already_decomposed\fP" +Has to be set to .false. for the first call with a given b and .true. for +each subsequent call with the same b, since b then already contains +decomposition and thus the decomposing step is skipped. + +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_generalized_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *b, \fBdatatype\fP *ev, \fBint\fP is_already_decomposed, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.TP +.RI "datatype *\fBb\fP;" +The matrix b defining the generalized eigenvalue problem. The dimensions and the datatype of the matrix b must be the same as matrix a. +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. 
Eigenvalues will be stored in \fIascendig\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. +.TP +.RI "int \fBis_already_decomposed\fP;" +Has to be set to 0 for the first call with a given b and 1 for each subsequent call with the same b, since b then already contains decomposition and thus the decomposing step is skipped. +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the generalized eigenvalues and (parts of) the eigenvector spectrum of a real symmtric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_generalized_eigenvalues\fP can be called. Especially the number of eigenvectors to be computed can be set with \fPelpa_set\fB(3). Unlike in the case of ordinary eigenvalue problem, the generalized problem calls some external scalapack routines. The user is responsible for initialization of the blacs context, which then has to be passed to elpa by \fPelpa_set\fB(3) \fIBEFORE\fP \fBelpa_generalized_eigenvalues\fP can be called. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_generalized_eigenvectors.3 elpa-2019.11.001/man/elpa_generalized_eigenvectors.3 --- elpa-2016.05.001/man/elpa_generalized_eigenvectors.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_generalized_eigenvectors.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,88 @@ +.TH "elpa_generalized_eigenvectors" 3 "Thu Feb 1 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_generalized_eigenvectors \- computes the generalized eigenvalues and (part of) the eigenvector spectrum for a real symmetric or complex hermitian matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBgeneralized_eigenvectors\fP (a, b, ev, q, is_already_decomopsed, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "datatype :: \fBb\fP" +The matrix b defining the generalized eigenvalue problem. The dimensions and datatype of the matrix b has to be the same as for matrix a. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. 
The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. Note that complex hermitian matrices also have real valued eigenvalues. +.TP +.RI "datatype :: \fBq\fP" +The storage space for the computed eigenvectors. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "logical :: \fBis_already_decomposed\fP" +Has to be set to .false. for the first call with a given b and .true. for +each subsequent call with the same b, since b then already contains +decomposition and thus the decomposing step is skipped. + +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_generalized_eigenvectors\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *b, \fBdatatype\fP *ev, \fBdatatype\fP *q, \fBint\fP is_already_decomposed, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.TP +.RI "datatype *\fBb\fP;" +The matrix b defining the generalized eigenvalue problem. The dimensions and the datatype of the matrix b must be the same as matrix a. +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascendig\fP order. 
The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. +.TP +.RI "datatype *\fBq\fP;" +The storage space for the computed eigenvectors. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.TP +.RI "int \fBis_already_decomposed\fP;" +Has to be set to 0 for the first call with a given b and 1 for each subsequent call with the same b, since b then already contains decomposition and thus the decomposing step is skipped. +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the generalized eigenvalues and (parts of) the eigenvector spectrum of a real symmtric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_generalized_eigenvalues\fP can be called. Especially the number of eigenvectors to be computed can be set with \fPelpa_set\fB(3). Unlike in the case of ordinary eigenvalue problem, the generalized problem calls some external scalapack routines. The user is responsible for initialization of the blacs context, which then has to be passed to elpa by \fPelpa_set\fB(3) \fIBEFORE\fP \fBelpa_generalized_eigenvalues\fP can be called. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_hermitian_multiply.3 elpa-2019.11.001/man/elpa_hermitian_multiply.3 --- elpa-2016.05.001/man/elpa_hermitian_multiply.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_hermitian_multiply.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,115 @@ +.TH "elpa_hermitian_multiply" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_hermitian_multiply \- Performs C = A**H * B +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBhermitian_multiply\fP (uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, & + c, nrows_c, ncols_c, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "character*1 :: \fBuplo_a\fP" +set to 'U' if A is upper triangular, 'L' if A is lower triangular or anything else if A is a full matrix +.TP +.RI "character*1 :: \fBuplo_c\fP" +set to 'U' if only the upper diagonal part of C is needed, to 'L' if only the upper diagonal part of C is needed, or to anything else if the full matrix C is needed +.TP +.RI "integer :: \fBncb\fP" +The number of columns of the global matrices b and c +.TP +.RI "datatype :: \fBa\fP" +The matrix a. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "datatype :: \fBb\fP" +The matrix b. 
The dimensions of the matrix are specified by the parameters \fBnrows_b\fP and \fBncols_b\fP. The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "integer :: \fBnrows_b\fP" +The number of rows of matrix b +.TP +.RI "integer :: \fBncols_b\fP" +The number of columns of matrix b +.TP +.RI "datatype :: \fBc\fP" +The matrix c. The dimensions of the matrix are specified by the parameters \fBnrows_c\fP and \fBncols_c\fP. The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "integer :: \fBnrows_c\fP" +The number of rows of matrix c +.TP +.RI "integer :: \fBncols_c\fP" +The number of columns of matrix c +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be queried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_hermitian_multiply\fP(\fBelpa_t\fP handle, \fBchar\fP uplo_a, \fBchar\fP uplo_c, \fBint\fP ncb, \fBdatatype\fP *a, \fBdatatype\fP *b, \fBint\fP nrows_b, \fBint\fP ncols_b, \fBdatatype\fP *c, \fBint\fP nrows_c, \fBint\fP ncols_c, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "char \fBuplo_a\fP;" +set to 'U' if A is upper triangular, 'L' if A is lower triangular or anything else if A is a full matrix +.TP +.RI "char \fBuplo_c\fP;" +set to 'U' if only the upper diagonal part of C is needed, to 'L' if only the lower diagonal part of C is needed, or to anything else if the full matrix C is needed +.TP +.RI "int \fBncb\fP;" +The number of columns of the global matrices b and c +.TP +.RI "datatype *\fBa\fP;" +The matrix a.
The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "double", "float", "double complex", or "float complex" +.TP +.RI "datatype *\fBb\fP;" +The matrix b. The dimensions of the matrix are specified by the parametes \fBnrows_b\fP and \fBncols_b\fP. The datatype of the matrix can be one of "double", "float", "double complex", or "float complex" +.TP +.RI "int \fBnrows_b\fP;" +The number of rows of matrix b +.TP +.RI "int \fBncols_b\fP;" +The number of columns of matrix b +.TP +.RI "datatype *\fBc\fP;" +The matrix c. The dimensions of the matrix are specified by the parametes \fBnrows_c\fP and \fBncols_c\fP. The datatype of the matrix can be one of "double", "float", "double complex", or "float complex" +.TP +.RI "int \fBnrows_c\fP;" +The number of rows of matrix c +.TP +.RI "int \fBncols_c\fP;" +The number of columns of matrix c +.TP +.RI "int *\fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + + +.SH DESCRIPTION +Performa a "hermitian" multiplication C = A**T * B for real matrices and C=A**H * B for complex matrices. The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_hermitian_multiply\fP can be called. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_init.3 elpa-2019.11.001/man/elpa_init.3 --- elpa-2016.05.001/man/elpa_init.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_init.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,53 @@ +.TH "elpa_init" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_init \- initialize the ELPA library +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "error = \fBelpa_init\fP (api_version)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "integer, intent(in) :: \fBapi_version\fP ! the api version that you want to initialize, currently the version is 20171201" +.br +.RI "integer :: \fBerror\fP ! the return code. If the function returns without an error, the error code will be ELPA_OK." +.br + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBint\fP error = \fBelpa_init\fP (\fBint\fP api_version);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "int \fBapi_version\fP; // the api version that you want to initialize currently the version is 20171201" +.br +.RI "int \fBerror\fP; // the return code. If the function returns without an error, the error code will be ELPA_OK." + +.SH DESCRIPTION +Initializes the ELPA library for usage. The return code shold be ELPA_OK. The return code can be querried +with the \fBelpa_strerr\fP(3) function. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_allocate\fP(3) \fPelpa_set\fP(3) \fPelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_choleksy\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fPelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_invert_triangular.3 elpa-2019.11.001/man/elpa_invert_triangular.3 --- elpa-2016.05.001/man/elpa_invert_triangular.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_invert_triangular.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,58 @@ +.TH "elpa_invert_triangular" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_invert_triangular \- Invert an upper triangular matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBinvert_triangular\fP (a, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a which should be inverted. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)" +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_invert_triangular\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix which should be inverted. 
The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Inverts an upper triangular real or complex matrix. The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_eigenvalues\fP can be called. +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_choleksy\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_load_settings.3 elpa-2019.11.001/man/elpa_load_settings.3 --- elpa-2016.05.001/man/elpa_load_settings.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_load_settings.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,57 @@ +.TH "elpa_load_settings" 3 "Tue Nov 13 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_load_settings \- loads the setting of an elpa object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBload_settings\fP (filename, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\f: returns an instance of the ELPA object" +.br +.RI "character(*) \fBfilename\fP: The file from where to load the settings" +.br +.RI "integer, optinal \fBerror\fP: An error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_load_settings\fP(\fBelpa_t\fP handle, \fBconst char\fP *filename, \fBint\fP *error): +.br +.RI " 
" +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.TP +.RI "const char \fB*filename\fP: the filename to load the settings" +.br +.RI "int \fB*error\fP: the error return code" +.TP + +.SH DESCRIPTION +Loads all the settings of an previously stored ELPA object from a file specified via the \fBfilename\fP parameter. +.SH "SEE ALSO" +.br +\fBelpa_store_setting\fP(3) diff -Nru elpa-2016.05.001/man/elpa_print_settings.3 elpa-2019.11.001/man/elpa_print_settings.3 --- elpa-2016.05.001/man/elpa_print_settings.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_print_settings.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,52 @@ +.TH "elpa_print_settings" 3 "Tue Nov 20 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_print_settings \- prints the setting of an elpa object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBprint_settings\fP (error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\f: returns an instance of the ELPA object" +.br +.RI "integer, optinal \fBerror\fP: An error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_print_settings\fP(\fBelpa_t\fP handle, \fBint\fP *error): +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.RI "int \fB*error\fP: the error return code" +.TP + +.SH DESCRIPTION +Prints all the settings of an ELPA object. 
The settings can be stored, or loaded with \fBelpa_store_settings\fP.3 or \fBelpa_load_settings\fP.3 +.SH "SEE ALSO" +.br +\fBelpa_store_setting\fP(3) \fBelpa_load_settings\fP.(3) diff -Nru elpa-2016.05.001/man/elpa_set.3 elpa-2019.11.001/man/elpa_set.3 --- elpa-2016.05.001/man/elpa_set.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_set.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,130 @@ +.TH "elpa_set" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_set \- set parameter or tunables for the ELPA library +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call \fBelpa%set\fP (\fBcharacter(*)\fP name, \fBdatatype\fP value, \fBinteger\fP error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "character(*) :: \fBname\fP" +the name of the option to be set +.br +.TP +.RI "datatype :: \fBvalue\fP" +the value which should be assigned to the option \fBname\fP. The datatype can be \fBinteger\fP or \fBreal(kind=c_double)\fP. +.br +.TP +.RI "integer, optional :: \fBerror\fP" +the returned error code. On success it is ELPA_OK, otherwise an error. The error code can be querried with \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBvoid\fP \fBelpa_set\fP (\fBelpa_t\fP handle, \fBconst char\fP *name, \fBdatatype\fP value, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.TP +.RI "elpa_t \fBhandle\fP;" +the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.TP +.RI "const char \fB*name\fP;" +the name of the option to be set. +.br +.TP +.RI "datatype \fBvalue\fP;" +the value which should be assigned to the option \fBname\fP. Datatype can be either \fBint\fP or \fBdouble\fP." 
+ +.SH DESCRIPTION +The elpa_set function is used to set \fBparameters\fP and \fBtunables\fP for the run-time of the ELPA library. It returns an error code which can be querried with \fBelpa_strerr\fP(3). + +\fBParameters:\fP + +Parameters of an ELPA instance have to be set \fIBEFORE\fP the ELPA instance is set up with the function \fBelpa_setup\fP(3). + +At the moment the following parameters are supported: +.br +.TP +.RI \fB"na"\fP: +integer parameter. The global matrix has size (na * na) +.TP +.RI \fB"nev"\fP: +integer parameter. The number of eigenvectors to be computed in a call to \fBelpa_eigenvectors\fP(3). Must have dimension 1 <= nev <= na. +.TP +.RI \fB"local_nrows"\fP: +integer parameter. Number of matrix rows stored on this MPI process. +.TP +.RI \fB"local_ncols"\fP: +integer parameter. Number of matrix cols stored on this MPI process. +.TP +.RI \fB"process_row"\fP: +integer parameter. Process row number in the 2D domain decomposition. +.TP +.RI \fB"process_col"\fP: +integer parameter. Process col number in the 2D domain decomposition. +.TP +.RI \fB"mpi_comm_parent"\fP: +integer parameter. The parent MPI communicator which includes all MPI process which are used in the 2D domain decomposition. +.TP +.RI \fB"bandwidth"\fP: +integer parameter. Some ELPA compute steps can be accelerated if the matrix is already in banded form. If set, ELPA assumes that the bandwidth of the matrix is the value set. +.TP +.RI \fB"blacs_context"\fP: +integer parameter. The generalized eigenvalue solver \fBelpa_generalized_eigenvectors\fP(3) use internal calls to some of the scalapack routines. Thus before calling it, the user has to provide properly initialized blacs context. +.TP +.RI \fB"timings"\fP: +Choose whether time measurements should be done in the ELPA routines. + +.LP +\fBTunables:\fP + +Tunables of an ELPA option can be set at \fIanytime\fP. 
+ +At the moment the following parameters are supported: +.br +.TP +.RI \fB"solver"\fP: +Choose which solver should be used in the compute steps \fBelpa_eigenvalues\fP(3) or \fBelpa_eigenvectors\fP(3). At the moment allowed options are \fB"ELPA_SOLVER_1STAGE"\fP or \fB"ELPA_SOLVER_2STAGE"\fP. +.TP +.RI \fB"real_kernel"\fP: +Choose which real kernel should be used in the \fBelpa_eigenvalues\fP(3) or \fBelpa_eigenvectors\fP(3) compute steps, if solver is set to \fB"ELPA_SOLVER_2STAGE"\fP. The available kernels can be queried with \fBelpa2_print_kernels\fP(1). +.TP +.RI \fB"complex_kernel"\fP: +Choose which complex kernel should be used in the \fBelpa_eigenvalues\fP(3) or \fBelpa_eigenvectors\fP(3) compute steps, if solver is set to \fB"ELPA_SOLVER_2STAGE"\fP. The available kernels can be queried with \fBelpa2_print_kernels\fP(1). +.TP +.RI \fB"qr"\fP: +Choose whether in the real case computations in \fBelpa_eigenvalues\fP(3) or \fBelpa_eigenvectors\fP(3) compute steps, if solver is set to \fB"ELPA_SOLVER_2STAGE"\fP, a QR decomposition should be used. +.TP +.RI \fB"gpu"\fP: +Choose whether accelerated GPU calculations should be used. Only available if ELPA has been built with GPU support. +.TP +.RI \fB"debug"\fP: +Choose whether, in case of an error, more debug information should be provided.
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_deallocate\fP(3) \fBelpa_uninit\fP(3) diff -Nru elpa-2016.05.001/man/elpa_setup.3 elpa-2019.11.001/man/elpa_setup.3 --- elpa-2016.05.001/man/elpa_setup.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_setup.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,50 @@ +.TH "elpa_setup" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_setup \- setup an instance of the ELPA library +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "success= \fBelpa%setup\fP ()" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "integer :: \fBsucces\fP ! the returned error code. Should normally be ELPA_OK. Can be querried with \fBelpa_strerr\fP(3)" +.br + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBint\fP success = \fBelpa_setup\fP (\fBelpa_t\fP handle);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "elpa_t \fBhandle\fP; // the handle of an ELPA object, obtained before with \fBelpa_allocate\fP(3)" +.br +.RI "int \fBsuccess\fP; // the returned error code. Should normally be ELPA_OK. Can be querried with \fBelpa_strerr\fP(3)" + +.SH DESCRIPTION +Setups an ELPA object. \fIPrior\fP to calling the setup, the functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3) \fImust have been called\fP and some parameters must have been set with \fBelpa_set\fP(3). 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_deallocate\fP(3) \fBelpa_uninit\fP(3) diff -Nru elpa-2016.05.001/man/elpa_skew_eigenvalues.3 elpa-2019.11.001/man/elpa_skew_eigenvalues.3 --- elpa-2016.05.001/man/elpa_skew_eigenvalues.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_skew_eigenvalues.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,67 @@ +.TH "elpa_skew_eigenvalues" 3 "Thur Nov 7 2019" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_skew_eigenvalues \- computes the eigenvalues of a real skew-symmetric matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBskew_eigenvalues\fP (a, ev, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)" or "real(kind=c_float)". The matrix has to be skew-symmetric, this is not checked by the routine. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". 
The error code can be queried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_skew_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *ev, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double" or "float". +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascending\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. The matrix has to be skew-symmetric, this is not checked by the routine. +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be queried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the eigenvalues of a real skew-symmetric matrix. The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_skew_eigenvalues\fP can be called.
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_skew_eigenvectors\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_skew_eigenvectors.3 elpa-2019.11.001/man/elpa_skew_eigenvectors.3 --- elpa-2016.05.001/man/elpa_skew_eigenvectors.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_skew_eigenvectors.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,73 @@ +.TH "elpa_skew_eigenvectors" 3 "Thur Nov 7 2019" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_skew_eigenvectors \- computes the eigenvalues and (part of) the eigenvector spectrum for a real skew-symmetric matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBskew_eigenvectors\fP (a, ev, q, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBa\fP" +The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)". The matrix has to be skew-symmetric, this is not checked by the routine. +.TP +.RI "datatype :: \fBev\fP" +The vector ev where the eigenvalues will be stored in \fIascending\fP order. The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending of the datatype of the matrix. Note that complex hermitian matrices also have real valued eigenvalues. 
+.RI "datatype :: \fBq\fP" +The storage space for the computed eigenvectors. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)". Note, that for a skew-symmetric matrix the eigenvectors are complex. The routines returns separately the real and imaginary parts of the complex eigenvectors. Thus, the storage space has to be of dimension q(#numer_of_rows,2*#number_of_column). +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *ev, \fBdatatype\fP *q, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBa\fP;" +The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". The matrix has to be symmetric or hermitian, this is not checked by the routine. +.TP +.RI "datatype *\fBev\fP;" +The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascendig\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real. +.TP +.RI "datatype *\fBq\fP;" +The storage space for the computed eigenvectors. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). 
The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". Note, that for a skew-symmetric matrix the eigenvectors are complex. The routines returns separately the real and imaginary parts of the complex eigenvectors. Thus, the storage space has to be of dimension q(#numer_of_rows,2*#number_of_column). + +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Compute the eigenvalues and (parts of) the eigenvector spectrum of a real symmetric or complex hermitian matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_eigenvalues\fP can be called. Especially the number of eigenvectors to be computed can be set with \fPelpa_set\fB(3) +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_skew_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_solve_tridiagonal.3 elpa-2019.11.001/man/elpa_solve_tridiagonal.3 --- elpa-2016.05.001/man/elpa_solve_tridiagonal.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_solve_tridiagonal.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,72 @@ +.TH "elpa_solve_tridiagonal" 3 "Sat Jul 15 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_solve_tridiagonal \- computes the eigenvalue problem for real symmetric tridiagonal matrix +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBeigenvectors\fP (d, e, q, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + 
+.br +.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object" +.br +.TP +.RI "datatype :: \fBd\fP" +The diagonal elements of a matrix whose dimensions have been defined in \fBelpa_setup\fP(3). On exist the eigenvalues are stored in this 1d-array. The datatype of the diagonal elements can either be "real(kind=c_double)" or "real(kind=c_float)". +.TP +.RI "datatype :: \fBe\fP" +The offdiagonal elements of the matrix. The datatype of the diagonal elements can either be "real(kind=c_double)" or "real(kind=c_float)". +.RI "datatype :: \fBq\fP" +The storage space for the computed eigenvectors. The datatype of the matrix can be either "real(kind=c_double)" or "real(kind=c_float)". +.TP +.RI "integer, optional :: \fBerror\fP" +The return error code of the function. Should be "ELPA_OK". The error code can be querried with the function \fBelpa_strerr\fP(3) + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_eigenvalues\fP(\fBelpa_t\fP handle, \fBdatatype\fP *d, \fBdatatype\fP *e, \fBdatatype\fP *q, \fBint\fP *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.TP +.RI "elpa_t \fBhandle\fP;" +The handle to the ELPA object +.TP +.RI "datatype *\fBd\fP;" +The diagonal elements of the matrix. The dimensions of the matrix must be set \fIBEFORE\fP with \fBelpa_setup\fP(3). On exist the eigenvalues are stored in d. The \fBdatatype\fP can be one of "double" or "float". +.TP +.RI "datatype *\fBe\fP;" +The offdiagonal elements of the matrix. The \fBdatatype\fP can be one of "double" or "float". +.TP +.RI "datatype *\fBq\fP;" +The storage space for the computed eigenvectors. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex". +.TP +.RI "int *\fBerror\fP;" +The error code of the function. Should be "ELPA_OK". 
The error codes can be querried with \fBelpa_strerr\fP(3) + +.SH DESCRIPTION +Computes the eigenvalue problem of a real symmtric tridiagonal matrix.The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_solve_tridiagonal\fP can be called. +.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/elpa_store_settings.3 elpa-2019.11.001/man/elpa_store_settings.3 --- elpa-2016.05.001/man/elpa_store_settings.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_store_settings.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,58 @@ +.TH "elpa_store_settings" 3 "Tue Nov 13 2018" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_store_settings \- stores the setting of an elpa object +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call elpa%\fBstore_settings\fP (filename, error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "class(elpa_t) \fBelpa\f: returns an instance of the ELPA object" +.br +.RI "character(*) \fBfilename\fP: The filename to be used for storing the settings" +.br +.RI "integer, optinal \fBerror\fP: An error return code" +.br + +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "void \fBelpa_store_settings\fP(\fBelpa_t\fP handle, \fBconst char\fP *filename, \fBint\fP *error): +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" +.br +.br +.TP +.RI "elpa_t \fBhandle\fP: The handle to the ELPA object" +.br +.TP +.RI "const char \fB*filename\fP: the filename to store the settings" +.br +.RI "int \fB*error\fP: the error return 
code" +.TP + +.SH DESCRIPTION +Stores all the settings of an ELPA object in a human readable form to a file specified via the \fBfilename\fP parameter. The settings can later be restored with the +\fBelpa_load_settings\fP(3) method. +.SH "SEE ALSO" +.br +\fBelpa_load_setting\fP(3) diff -Nru elpa-2016.05.001/man/elpa_uninit.3 elpa-2019.11.001/man/elpa_uninit.3 --- elpa-2016.05.001/man/elpa_uninit.3 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/man/elpa_uninit.3 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,50 @@ +.TH "elpa_uninit" 3 "Sat Jun 3 2017" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +elpa_uninit \- uninitialize the ELPA library +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa +.br +class(elpa_t), pointer :: elpa +.br + +.RI "call \fBelpa_uninit\fP (error)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "error integer, optional error code" +.br +.br + +.br +.SS C INTERFACE +#include +.br +elpa_t handle; + +.br +.RI "\fBvoid\fP \fBelpa_uninit\fP (int *error);" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.br +.RI "\fBint *\fP error : the error code" +.br + +.SH DESCRIPTION +Uninitializes the ELPA library after usage. The function \fBelpa_init\fP(3) must have been called \fIBEFORE\fP elpa_uninit can be called. 
+.br +.SH "SEE ALSO" +.br +\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_setup\fP(3) \fBelpa_deallocate\fP(3) diff -Nru elpa-2016.05.001/man/get_elpa_communicators.3 elpa-2019.11.001/man/get_elpa_communicators.3 --- elpa-2016.05.001/man/get_elpa_communicators.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/get_elpa_communicators.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ -.TH "get_elpa_communicators" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -get_elpa_communicators \- get the MPI row and column communicators needed in ELPA -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 - -.br -.RI "success = \fBget_elpa_communicators\fP (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols)" - -.br -.br -.RI "integer, intent(in) \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "integer, intent(in) \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "integer, intent(in) \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within rows of processes" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within columns of processes" -.br - -.RI "integer \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - -.SS C INTERFACE -#include "elpa_generated.h" - -.br -.RI "success = \fBget_elpa_communicators\fP (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols);" - -.br -.br -.RI "int \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "int \fBmy_prow\fP: row coordinate of the calling 
process in the process grid" -.br -.RI "int \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within rows of processes" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within columns of processes" -.br - -.RI "int \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - - - - -.SH DESCRIPTION -All ELPA routines need MPI communicators for communicating within rows or columns of processes. These communicators are created from the \fBmpi_comm_global\fP communicator. It is assumed that the matrix used in ELPA is distributed with \fBmy_prow\fP rows and \fBmy_pcol\fP columns on the calling process. This function has to be envoked by all involved processes before any other calls to ELPA routines. -.br -.SH "SEE ALSO" -\fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/get_elpa_row_col_comms.3 elpa-2019.11.001/man/get_elpa_row_col_comms.3 --- elpa-2016.05.001/man/get_elpa_row_col_comms.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/get_elpa_row_col_comms.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -.TH "get_elpa_row_col_comms" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -get_elpa_row_col_comms \- old, deprecated interface to get the MPI row and column communicators needed in ELPA. 
-It is recommended to use \fBget_elpa_communicators\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 - -.br -.RI "success = \fBget_elpa_row_col_comms\fP (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols)" - -.br -.br -.RI "integer, intent(in) \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "integer, intent(in) \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "integer, intent(in) \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within rows of processes" -.br -.RI "integer, intent(out) \fBmpi_comm_row\fP: communicator for communication within columns of processes" -.br - -.RI "integer \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - -.SS C INTERFACE -#include "elpa_generated.h" - -.br -.RI "success = \fBelpa_get_communicators\fP (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols);" - -.br -.br -.RI "int \fBmpi_comm_global\fP: global communicator for the calculation" -.br -.RI "int \fBmy_prow\fP: row coordinate of the calling process in the process grid" -.br -.RI "int \fBmy_pcol\fP: column coordinate of the calling process in the process grid" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within rows of processes" -.br -.RI "int *\fBmpi_comm_row\fP: pointer to the communicator for communication within columns of processes" -.br - -.RI "int \fBsuccess\fP: return value indicating success or failure of the underlying MPI_COMM_SPLIT function" - - - - - -.SH DESCRIPTION -All ELPA routines need MPI communicators for communicating within rows or columns of processes. These communicators are created from the \fBmpi_comm_global\fP communicator. 
It is assumed that the matrix used in ELPA is distributed with \fBmy_prow\fP rows and \fBmy_pcol\fP columns on the calling process. This function has to be envoked by all involved processes before any other calls to ELPA routines. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real\fP(3) \fBsolve_evp_complex\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_complex_1stage.3 elpa-2019.11.001/man/solve_evp_complex_1stage.3 --- elpa-2016.05.001/man/solve_evp_complex_1stage.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_complex_1stage.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,88 +0,0 @@ -.TH "solve_evp_complex_1stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex_1stage \- solve the complex eigenvalue problem with the 1-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_complex_1stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" -.br -#include - -.br -.RI "success = \fBsolve_evp_complex_1stage\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex*\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_complex_2stage.3 elpa-2019.11.001/man/solve_evp_complex_2stage.3 --- elpa-2016.05.001/man/solve_evp_complex_2stage.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_complex_2stage.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -.TH "solve_evp_complex_2stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex_2stage \- solve the complex eigenvalue problem with the 2-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -use elpa2 -.br -.br -.RI "success = \fBsolve_evp_real_2stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" -.br -#include - -.br -.RI "success = \fBsolve_evp_complex_2stage\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_complex.3 elpa-2019.11.001/man/solve_evp_complex.3 --- elpa-2016.05.001/man/solve_evp_complex.3 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_complex.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -.TH "solve_evp_complex" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_complex \- solve the complex eigenvalue problem with the 1-stage ELPA solver. -This interface is old and deprecated. It is recommended to use \fBsolve_evp_complex_1stage\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_complex\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SH DESCRIPTION -Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_real_1stage.3 elpa-2019.11.001/man/solve_evp_real_1stage.3 --- elpa-2016.05.001/man/solve_evp_real_1stage.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_real_1stage.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,86 +0,0 @@ -.TH "solve_evp_real_1stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real_1stage \- solve the real eigenvalue problem with the 1-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_real_1stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" - -.br -.RI "success = \fBsolve_evp_real_1stage\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. 
-.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_real_2stage.3 elpa-2019.11.001/man/solve_evp_real_2stage.3 --- elpa-2016.05.001/man/solve_evp_real_2stage.3 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_real_2stage.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -.TH "solve_evp_real_2stage" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real_2stage \- solve the real eigenvalue problem with the 2-stage ELPA solver -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -use elpa2 -.br -.br -.RI "success = \fBsolve_evp_real_2stage\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "logical, intent(in), optional: \fBuseQR\fP: optional argument; switches to QR-decomposition if set to .true." 
- -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SS C INTERFACE -#include "elpa.h" - -.br -.RI "success = \fBsolve_evp_real_2stage\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQr);" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" -.br -.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" -.br -.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. 
Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" -.br -.RI "int \fBuseQR\fP: if set to 1 switch to QR-decomposition" - -.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) - -.SH DESCRIPTION -Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/man/solve_evp_real.3 elpa-2019.11.001/man/solve_evp_real.3 --- elpa-2016.05.001/man/solve_evp_real.3 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/man/solve_evp_real.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -.TH "solve_evp_real" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*- -.ad l -.nh -.SH NAME -solve_evp_real \- solve the real eigenvalue problem with the 1-stage ELPA solver. -This is an old and deprecated interface. 
It is recommendet to use \fBsolve_evp_real_1stage\fP(3) -.br - -.SH SYNOPSIS -.br -.SS FORTRAN INTERFACE -use elpa1 -.br -.br -.RI "success = \fBsolve_evp_real\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" -.br -.RI " " -.br -.RI "With the definintions of the input and output variables:" - -.br -.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" -.br -.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" -.br -.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" -.br -.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" -.br -.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" -.br -.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" -.br -.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" -.br -.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" -.br -.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" -.br -.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" -.br -.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" -.br - -.RI "logical \fBsuccess\fP: return value indicating success or failure" -.br -.SH DESCRIPTION -Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. 
The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. -.br -.SH "SEE ALSO" -\fBget_elpa_communicators\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBelpa2_print_kernels\fP(1) diff -Nru elpa-2016.05.001/manual_cpp elpa-2019.11.001/manual_cpp --- elpa-2016.05.001/manual_cpp 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/manual_cpp 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,63 @@ +#!/usr/bin/python +from __future__ import print_function +import os +import sys +import subprocess + +def cpp_arg(arg): + return arg.startswith("-I") or \ + arg.startswith("-D") or \ + arg.startswith("-U") + + +def check_call(args, **kwargs): + if os.getenv("V") == "1": + print(" ".join(args)) + return subprocess.check_call(args, **kwargs) + +def check_call_redirect(args, filename=None, **kwargs): + if os.getenv("V") == "1": + print(" ".join(args), ">", filename) + with open(filename, "wb") as fd: + try: + return subprocess.check_call(args, stdout=fd, **kwargs) + except subprocess.CalledProcessError as e: + os.remove(filename) + raise SystemExit(e.returncode) + +args = sys.argv[1:] +cpp_args = filter(cpp_arg, args) + +files = filter(lambda q : q.endswith(".F90"), args) +args = filter(lambda q : not q.endswith(".F90"), args) +if len(files) > 1: + raise Exception("Specify exactly one .F90 file") +elif len(files) == 0: + # No .F90 file specified, execute program as-is + try: + os.execvp(args[0], args[0:]) + except OSError as e: + print("Error executing '{0}': {1}".format(args[0], e.args[1])) + raise SystemExit(1) +elif len(files) == 1: + file, = files + +tmp_filename = "manually_preprocessed_" + 
file.replace("/", "_") + +try: + output = args.index("-o") + outputname = args[output + 1] + tmp_filename += "-" + outputname.replace("/", "_") + ".F90" +except ValueError: + pass + +tmp_filename = tmp_filename[-250:] + +# preprocess +check_call_redirect(["cpp","-P", "-traditional", "-Wall", "-Werror"] + cpp_args + [file], filename=tmp_filename) + +# compile +check_call(args + [tmp_filename]) + +# cleanup +os.remove(tmp_filename) diff -Nru elpa-2016.05.001/missing elpa-2019.11.001/missing --- elpa-2016.05.001/missing 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/missing 2019-12-21 16:29:46.000000000 +0000 @@ -1,9 +1,9 @@ #! /bin/sh # Common wrapper for a few potentially missing GNU programs. -scriptversion=2013-10-28.13; # UTC +scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1996-2014 Free Software Foundation, Inc. +# Copyright (C) 1996-2018 Free Software Foundation, Inc. # Originally written by Fran,cois Pinard , 1996. # This program is free software; you can redistribute it and/or modify @@ -17,7 +17,7 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -101,9 +101,9 @@ exit $st fi -perl_URL=http://www.perl.org/ -flex_URL=http://flex.sourceforge.net/ -gnu_software_URL=http://www.gnu.org/software +perl_URL=https://www.perl.org/ +flex_URL=https://github.com/westes/flex +gnu_software_URL=https://www.gnu.org/software program_details () { @@ -207,9 +207,9 @@ exit $st # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru elpa-2016.05.001/nvcc_wrap elpa-2019.11.001/nvcc_wrap --- elpa-2016.05.001/nvcc_wrap 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/nvcc_wrap 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,3 @@ +#!/bin/sh + +$NVCC `echo $@ | sed 's/-fPIC/-Xcompiler -fPIC/; s/-Wl/-Xlinker -Wl/; '` diff -Nru elpa-2016.05.001/py-compile elpa-2019.11.001/py-compile --- elpa-2016.05.001/py-compile 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/py-compile 2019-12-21 16:29:48.000000000 +0000 @@ -0,0 +1,170 @@ +#!/bin/sh +# py-compile - Compile a Python program + +scriptversion=2018-03-07.03; # UTC + +# Copyright (C) 2000-2018 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. 
If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to or send patches to +# . + +if [ -z "$PYTHON" ]; then + PYTHON=python +fi + +me=py-compile + +usage_error () +{ + echo "$me: $*" >&2 + echo "Try '$me --help' for more information." >&2 + exit 1 +} + +basedir= +destdir= +while test $# -ne 0; do + case "$1" in + --basedir) + if test $# -lt 2; then + usage_error "option '--basedir' requires an argument" + else + basedir=$2 + fi + shift + ;; + --destdir) + if test $# -lt 2; then + usage_error "option '--destdir' requires an argument" + else + destdir=$2 + fi + shift + ;; + -h|--help) + cat <<\EOF +Usage: py-compile [--help] [--version] [--basedir DIR] [--destdir DIR] FILES..." + +Byte compile some python scripts FILES. Use --destdir to specify any +leading directory path to the FILES that you don't want to include in the +byte compiled file. Specify --basedir for any additional path information you +do want to be shown in the byte compiled file. + +Example: + py-compile --destdir /tmp/pkg-root --basedir /usr/share/test test.py test2.py + +Report bugs to . +EOF + exit $? + ;; + -v|--version) + echo "$me $scriptversion" + exit $? + ;; + --) + shift + break + ;; + -*) + usage_error "unrecognized option '$1'" + ;; + *) + break + ;; + esac + shift +done + +files=$* +if test -z "$files"; then + usage_error "no files given" +fi + +# if basedir was given, then it should be prepended to filenames before +# byte compilation. +if [ -z "$basedir" ]; then + pathtrans="path = file" +else + pathtrans="path = os.path.join('$basedir', file)" +fi + +# if destdir was given, then it needs to be prepended to the filename to +# byte compile but not go into the compiled file. 
+if [ -z "$destdir" ]; then + filetrans="filepath = path" +else + filetrans="filepath = os.path.normpath('$destdir' + os.sep + path)" +fi + +$PYTHON -c " +import sys, os, py_compile, imp + +files = '''$files''' + +sys.stdout.write('Byte-compiling python modules...\n') +for file in files.split(): + $pathtrans + $filetrans + if not os.path.exists(filepath) or not (len(filepath) >= 3 + and filepath[-3:] == '.py'): + continue + sys.stdout.write(file) + sys.stdout.flush() + if hasattr(imp, 'get_tag'): + py_compile.compile(filepath, imp.cache_from_source(filepath), path) + else: + py_compile.compile(filepath, filepath + 'c', path) +sys.stdout.write('\n')" || exit $? + +# this will fail for python < 1.5, but that doesn't matter ... +$PYTHON -O -c " +import sys, os, py_compile, imp + +# pypy does not use .pyo optimization +if hasattr(sys, 'pypy_translation_info'): + sys.exit(0) + +files = '''$files''' +sys.stdout.write('Byte-compiling python modules (optimized versions) ...\n') +for file in files.split(): + $pathtrans + $filetrans + if not os.path.exists(filepath) or not (len(filepath) >= 3 + and filepath[-3:] == '.py'): + continue + sys.stdout.write(file) + sys.stdout.flush() + if hasattr(imp, 'get_tag'): + py_compile.compile(filepath, imp.cache_from_source(filepath, False), path) + else: + py_compile.compile(filepath, filepath + 'o', path) +sys.stdout.write('\n')" 2>/dev/null || : + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'before-save-hook 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC0" +# time-stamp-end: "; # UTC" +# End: diff -Nru elpa-2016.05.001/python/pyelpa/distributedmatrix.py elpa-2019.11.001/python/pyelpa/distributedmatrix.py --- elpa-2016.05.001/python/pyelpa/distributedmatrix.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/pyelpa/distributedmatrix.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,399 @@ 
+"""distributedmatrix.py -- classes for distributed matrices + +This file contains the python classes to use with the wrapper. +""" +import numpy as np +from functools import wraps +from .wrapper import Elpa + +class ProcessorLayout: + """Create rectangular processor layout for use with distributed matrices""" + def __init__(self, comm): + """Initialize processor layout. + + Args: + comm: MPI communicator from mpi4py + """ + nprocs = comm.Get_size() + rank = comm.Get_rank() + for np_cols in range(int(np.sqrt(nprocs)), 0, -1): + if nprocs % np_cols == 0: + break + #if nprocs == 1: + # np_cols = 1 + np_rows = nprocs//np_cols + # column major distribution of processors + my_pcol = rank // np_rows + my_prow = rank % np_rows + self.np_cols, self.np_rows = np_cols, np_rows + self.my_pcol, self.my_prow = my_pcol, my_prow + self.comm = comm + self.comm_f = comm.py2f() + + +class DistributedMatrix: + """Class for generating a distributed block-cyclic matrix + + The data attribute contains the array in the correct size for the local + processor. + """ + def __init__(self, processor_layout, na, nev, nblk, dtype=np.float64): + """Initialize distributed matrix for a given processor layout. 
+ + Args: + processor_layout (ProcessorLayout): has to be created from MPI + communicator + na (int): dimension of matrix + nev (int): number of eigenvectors/eigenvalues to be computed + nblk (int): block size of distributed matrix + dtype: data type of matrix + """ + self.na = na + self.nev = nev + self.nblk = nblk + self.processor_layout = processor_layout + + # get local size + self.na_rows = self.numroc(na, nblk, processor_layout.my_prow, 0, + processor_layout.np_rows) + self.na_cols = self.numroc(na, nblk, processor_layout.my_pcol, 0, + processor_layout.np_cols) + # create array + self.data = np.empty((self.na_rows, self.na_cols), + dtype=dtype, order='F') + + self.elpa = None + + @classmethod + def from_communicator(cls, comm, na, nev, nblk, dtype=np.float64): + """Initialize distributed matrix from a MPI communicator. + + Args: + comm: MPI communicator from mpi4py + na (int): dimension of matrix + nev (int): number of eigenvectors/eigenvalues to be computed + nblk (int): block size of distributed matrix + dtype: data type of matrix + """ + processor_layout = ProcessorLayout(comm) + return cls(processor_layout, na, nev, nblk, dtype) + + @classmethod + def from_comm_world(cls, na, nev, nblk, dtype=np.float64): + """Initialize distributed matrix from the MPI_COMM_WORLD communicator. 
+ + Args: + na (int): dimension of matrix + nev (int): number of eigenvectors/eigenvalues to be computed + nblk (int): block size of distributed matrix + dtype: data type of matrix + """ + from mpi4py import MPI + comm = MPI.COMM_WORLD + processor_layout = ProcessorLayout(comm) + return cls(processor_layout, na, nev, nblk, dtype) + + @classmethod + def like(cls, matrix): + """Get a DistributedMatrix with the same parameters as matrix""" + return cls(matrix.processor_layout, matrix.na, matrix.nev, matrix.nblk, + matrix.data.dtype) + + def get_local_index(self, global_row, global_col): + """compute local row and column indices from global ones + + Returns a tuple of the local row and column indices + """ + local_row = self.indxg2l(global_row, self.nblk, + self.processor_layout.my_prow, 0, + self.processor_layout.np_rows) + local_col = self.indxg2l(global_col, self.nblk, + self.processor_layout.my_pcol, 0, + self.processor_layout.np_cols) + return local_row, local_col + + def get_global_index(self, local_row, local_col): + """compute global row and column indices from local ones + + Returns a tuple of the global row and column indices + """ + global_row = self.indxl2g(local_row, self.nblk, + self.processor_layout.my_prow, 0, + self.processor_layout.np_rows) + global_col = self.indxl2g(local_col, self.nblk, + self.processor_layout.my_pcol, 0, + self.processor_layout.np_cols) + return global_row, global_col + + def is_local_index(self, global_row, global_col): + """check if global index is stored on current processor""" + return self.is_local_row(global_row) and self.is_local_col(global_col) + + def is_local_row(self, global_row): + """check if global row is stored on this processor""" + process_row = self.indxg2p(global_row, self.nblk, + self.processor_layout.my_prow, 0, + self.processor_layout.np_rows) + return process_row == self.processor_layout.my_prow + + def is_local_col(self, global_col): + process_col = self.indxg2p(global_col, self.nblk, + 
self.processor_layout.my_pcol, 0, + self.processor_layout.np_cols) + return process_col == self.processor_layout.my_pcol + + @staticmethod + def indxg2l(indxglob, nb, iproc, isrcproc, nprocs): + """compute local index from global index indxglob + + original netlib scalapack source: + + .. code-block:: fortran + + INDXG2L = NB*((INDXGLOB-1)/(NB*NPROCS))+MOD(INDXGLOB-1,NB)+1 + """ + # adapt to python 0-based indexing + return nb*(indxglob//(nb*nprocs)) + indxglob%nb + + @staticmethod + def indxl2g(indxloc, nb, iproc, isrcproc, nprocs): + """compute global index from local index indxloc + + original netlib scalapack source: + + .. code-block:: fortran + + INDXL2G = NPROCS*NB*((INDXLOC-1)/NB) + MOD(INDXLOC-1,NB) + + MOD(NPROCS+IPROC-ISRCPROC, NPROCS)*NB + 1 + """ + # adapt to python 0-based indexing + return nprocs*nb*(indxloc//nb) + indxloc%nb + \ + ((nprocs+iproc-isrcproc)%nprocs)*nb + + @staticmethod + def indxg2p(indxglob, nb, iproc, isrcproc, nprocs): + """compute process coordinate for global index + + original netlib scalapack source: + + .. code-block:: fortran + + INDXG2P = MOD( ISRCPROC + (INDXGLOB - 1) / NB, NPROCS ) + """ + # adapt to python 0-based indexing + return (isrcproc + indxglob // nb) % nprocs + + @staticmethod + def numroc(n, nb, iproc, isrcproc, nprocs): + """Get local dimensions of distributed block-cyclic matrix. + + Programmed after scalapack source (tools/numroc.f on netlib). 
+ """ + mydist = (nprocs + iproc - isrcproc) % nprocs + nblocks = n // nb + result = (nblocks // nprocs) * nb + extrablks = nblocks % nprocs + if mydist < extrablks: + result += nb + elif mydist == extrablks: + result += n % nb + return int(result) + + def _initialized_elpa(function): + # wrapper to ensure one-time initialization of Elpa object + @wraps(function) + def wrapped_function(self): + if self.elpa is None: + self.elpa = Elpa.from_distributed_matrix(self) + return function(self) + return wrapped_function + + @_initialized_elpa + def compute_eigenvectors(self): + """Compute eigenvalues and eigenvectors + + The eigenvectors are stored in columns. + This function returns a dictionary with entries 'eigenvalues' and + 'eigenvectors'. + + After computing the eigenvectors, the original content of the matrix is + lost. + """ + eigenvectors = DistributedMatrix.like(self) + eigenvalues = np.zeros(self.na, dtype=np.float64) + # call ELPA + self.elpa.eigenvectors(self.data, eigenvalues, eigenvectors.data) + return {'eigenvalues': eigenvalues, 'eigenvectors': eigenvectors} + + @_initialized_elpa + def compute_eigenvalues(self): + """Compute only the eigenvalues. + + This function returns the eigenvalues as an array. + + After computing the eigenvalues, the original content of the matrix is + lost. + """ + eigenvalues = np.zeros(self.na, dtype=np.float64) + # call ELPA + self.elpa.eigenvalues(self.data, eigenvalues) + return eigenvalues + + def set_data_from_global_matrix(self, matrix): + """Set local part of the global matrix""" + for local_row in range(self.na_rows): + for local_col in range(self.na_cols): + global_row, global_col = self.get_global_index(local_row, + local_col) + self.data[local_row, local_col] = matrix[global_row, + global_col] + + def dot(self, vector): + """Compute dot product of matrix with vector. + + This blocked implementation is much faster than the naive + implementation. 
+ """ + if len(vector.shape) > 1 or vector.shape[0] != self.na: + raise ValueError("Error: shape of vector {} incompatible to " + "matrix of size {:d}x{:d}.".format( + vector.shape, self.na, self.na)) + from mpi4py import MPI + summation = np.zeros_like(vector) + # loop only over blocks here + for local_row in range(0, self.na_rows, self.nblk): + for local_col in range(0, self.na_cols, self.nblk): + # do not go beyond the end of the matrix + row_block_size = min(local_row + self.nblk, + self.na_rows) - local_row + col_block_size = min(local_col + self.nblk, + self.na_cols) - local_col + global_row, global_col = self.get_global_index(local_row, + local_col) + # use numpy for faster dot product of local block + summation[global_row:global_row+row_block_size] += \ + np.dot(self.data[local_row:local_row + row_block_size, + local_col:local_col + col_block_size], + vector[global_col:global_col+col_block_size]) + result = np.zeros_like(vector) + self.processor_layout.comm.Allreduce(summation, result, op=MPI.SUM) + return result + + def _dot_naive(self, vector): + """Compute naive dot product of matrix with vector. + + Still in here as an example and for testing purposes. 
+ """ + from mpi4py import MPI + summation = np.zeros_like(vector) + for local_row in range(self.na_rows): + for local_col in range(self.na_cols): + global_row, global_col = self.get_global_index(local_row, + local_col) + summation[global_row] += self.data[local_row, local_col] *\ + vector[global_col] + result = np.zeros_like(vector) + self.processor_layout.comm.Allreduce(summation, result, op=MPI.SUM) + return result + + def get_column(self, global_col): + """Return global column""" + from mpi4py import MPI + column = np.zeros(self.na, dtype=self.data.dtype) + temporary = np.zeros_like(column) + if self.is_local_col(global_col): + for global_row in range(self.na): + if not self.is_local_row(global_row): + continue + local_row, local_col = self.get_local_index(global_row, + global_col) + temporary[global_row] = self.data[local_row, local_col] + # this could be done more efficiently with a gather + self.processor_layout.comm.Allreduce(temporary, column, op=MPI.SUM) + return column + + def get_row(self, global_row): + """Return global row""" + from mpi4py import MPI + row = np.zeros(self.na, dtype=self.data.dtype) + temporary = np.zeros_like(row) + if self.is_local_row(global_row): + for global_col in range(self.na): + if not self.is_local_col(global_col): + continue + local_row, local_col = self.get_local_index(global_row, + global_col) + temporary[global_col] = self.data[local_row, local_col] + # this could be done more efficiently with a gather + self.processor_layout.comm.Allreduce(temporary, row, op=MPI.SUM) + return row + + def global_indices(self): + """Return iterator over global indices of matrix. + + Use together with set_data_global_index and get_data_global_index. 
+ """ + for local_row in range(self.na_rows): + for local_col in range(self.na_cols): + yield self.get_global_index(local_row, local_col) + + def set_data_for_global_index(self, global_row, global_col, value): + """Set value of matrix at global coordinates""" + if self.is_local_index(global_row, global_col): + local_row, local_col = self.get_local_index(global_row, global_col) + self.data[local_row, local_col] = value + + def get_data_for_global_index(self, global_row, global_col): + """Get value of matrix at global coordinates""" + if self.is_local_index(global_row, global_col): + local_row, local_col = self.get_local_index(global_row, global_col) + return self.data[local_row, local_col] + else: + raise ValueError('Index out of bounds: global row {:d}, ' + 'global col {:d}'.format(global_row, global_col)) + + def global_block_indices(self): + """Return iterator over global indices of matrix blocks. + + Use together with set_block_global_index and get_block_global_index + for more efficient loops. + """ + for local_row in range(0, self.na_rows, self.nblk): + for local_col in range(0, self.na_cols, self.nblk): + # do not go beyond the end of the matrix + row_block_size = min(local_row + self.nblk, + self.na_rows) - local_row + col_block_size = min(local_col + self.nblk, + self.na_cols) - local_col + global_row, global_col = self.get_global_index(local_row, + local_col) + yield global_row, global_col, row_block_size, col_block_size + + def set_block_for_global_index(self, global_row, global_col, + row_block_size, col_block_size, value): + """Set value of block of matrix at global coordinates""" + if self.is_local_index(global_row, global_col): + local_row, local_col = self.get_local_index(global_row, global_col) + if value.shape != (row_block_size, col_block_size): + raise ValueError("value has the wrong shape. " + "Expected: {}, found: {}." 
+ .format((row_block_size, col_block_size), + value.shape) + ) + self.data[local_row:local_row+row_block_size, + local_col:local_col+col_block_size] = value + + def get_block_for_global_index(self, global_row, global_col, + row_block_size, col_block_size): + """Get value of block of matrix at global coordinates""" + if self.is_local_index(global_row, global_col): + local_row, local_col = self.get_local_index(global_row, global_col) + if local_row+row_block_size > self.na_rows or \ + local_col+col_block_size > self.na_cols: + raise ValueError("Block size wrong: exceeds dimensions of" + " matrix.") + return self.data[local_row:local_row+row_block_size, + local_col:local_col+col_block_size] + else: + raise ValueError('Index out of bounds: global row {:d}, ' + 'global col {:d}'.format(global_row, global_col)) diff -Nru elpa-2016.05.001/python/pyelpa/__init__.py elpa-2019.11.001/python/pyelpa/__init__.py --- elpa-2016.05.001/python/pyelpa/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/pyelpa/__init__.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,79 @@ +"""pyelpa -- python wrapper for ELPA + +This wrapper uses cython to wrap the C API of ELPA (Eigenvalue SoLvers for +Petaflop-Applications) so that it can be called from python. + +Examples: + +1. Use the Elpa object to access the eigenvectors/eigenvalues wrapper: + +>>> import numpy as np +... from pyelpa import ProcessorLayout, DistributedMatrix, Elpa +... from mpi4py import MPI +... import sys +... +... # set some parameters for matrix layout +... na = 1000 +... nev = 200 +... nblk = 16 +... +... # initialize processor layout, needed for calling ELPA +... comm = MPI.COMM_WORLD +... layout_p = ProcessorLayout(comm) +... +... # create arrays +... a = DistributedMatrix(layout_p, na, nev, nblk) +... eigenvectors = DistributedMatrix(layout_p, na, nev, nblk) +... eigenvalues = np.zeros(na, dtype=np.float64) +... +... # initialize elpa +... e = Elpa.from_distributed_matrix(a) +... +... 
# set input matrix (a.data) on this core (a is stored in a block-cyclic +... # distributed layout; local size: a.na_rows x a.na_cols) +... # Caution: using this, the global matrix will not be symmetric; this is just +... # and example to show how to access the data +... a.data[:, :] = np.random.rand(a.na_rows, a.na_cols).astype(np.float64) +... +... # now compute nev of na eigenvectors and eigenvalues +... e.eigenvectors(a.data, eigenvalues, eigenvectors.data) +... +... # now eigenvectors.data contains the local part of the eigenvector matrix +... # which is stored in a block-cyclic distributed layout +... +... # now eigenvalues contains all computed eigenvalues on all cores +... +... # now compute nev of na eigenvalues +... e.eigenvalues(a.data, eigenvalues) +... +... # now eigenvalues contains all computed eigenvalues on all cores + + +2. Use the functions provided by the DistributedMatrix object: + +>>> import numpy as np +... from pyelpa import DistributedMatrix +... +... # set some parameters for matrix layout +... na = 1000 +... nev = 200 +... nblk = 16 +... +... a = DistributedMatrix.from_comm_world(na, nev, nblk) +... # use a diagonal matrix as input +... matrix = np.diagflat(np.arange(na)**2) +... # set from global matrix +... a.set_data_from_global_matrix(matrix) +... +... data = a.compute_eigenvectors() +... eigenvalues = data['eigenvalues'] +... eigenvectors = data['eigenvectors'] +... # now eigenvectors.data contains the local part of the eigenvector matrix +... # which is stored in a block-cyclic distributed layout +... +... 
# now eigenvalues contains all computed eigenvalues on all cores +""" +from .wrapper import Elpa +from .distributedmatrix import ProcessorLayout, DistributedMatrix + +__all__ = ['ProcessorLayout', 'DistributedMatrix', 'Elpa'] diff -Nru elpa-2016.05.001/python/pyelpa/wrapper.pyx elpa-2019.11.001/python/pyelpa/wrapper.pyx --- elpa-2016.05.001/python/pyelpa/wrapper.pyx 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/pyelpa/wrapper.pyx 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,249 @@ +"""wrapper.pyx -- python wrapper for ELPA + +This file contains the cython part of the wrapper. +""" +cimport numpy as np +import numpy as np +import sys + +if 'mpi4py.MPI' in sys.modules.keys(): + raise NotImplementedError('Please load the pyelpa module before mpi4py, ' + 'otherwise there will be MPI problems.') + +# import the function definitions from the ELPA header +cdef import from "": + cdef struct elpa_struct: + pass + ctypedef elpa_struct *elpa_t + int elpa_init(int api_version) + void elpa_uninit(int *error) + elpa_t elpa_allocate(int *error) + void elpa_deallocate(elpa_t handle, int *error) + int elpa_setup(elpa_t handle) + void elpa_set_integer(elpa_t handle, const char *name, int value, int *error) + void elpa_get_integer(elpa_t handle, const char *name, int *value, int *error) + void elpa_set_double(elpa_t handle, const char *name, double value, int *error) + void elpa_get_double(elpa_t handle, const char *name, double *value, int *error) + void elpa_eigenvectors_d(elpa_t handle, double *a, double *ev, double *q, int *error) + void elpa_eigenvectors_f(elpa_t handle, float *a, float *ev, float *q, int *error) + void elpa_eigenvectors_dc(elpa_t handle, double complex *a, double *ev, double complex *q, int *error) + void elpa_eigenvectors_fc(elpa_t handle, float complex *a, float *ev, float complex *q, int *error) + void elpa_eigenvalues_d(elpa_t handle, double *a, double *ev, int *error) + void elpa_eigenvalues_f(elpa_t handle, float *a, float *ev, int 
*error) + void elpa_eigenvalues_dc(elpa_t handle, double complex *a, double *ev, int *error) + void elpa_eigenvalues_fc(elpa_t handle, float complex *a, float *ev, int *error) + int ELPA_OK + int ELPA_SOLVER_2STAGE + + +cdef class Elpa: + """Wrapper for ELPA C interface. + + Provides routines for initialization, deinitialization, setting and getting + properties and for calling the eigenvectors and eigenvalues routines. + The routines eigenvectors and eigenvalues select the right ELPA routine to + call depending on the argument type. + """ + cdef elpa_t handle + + def __init__(self): + """Run initialization and allocation of handle""" + if elpa_init(20171201) != ELPA_OK: + raise RuntimeError("ELPA API version not supported") + cdef int error + handle = elpa_allocate(&error) + self.handle = handle + + def set_integer(self, description, int value): + """Wraps elpa_set_integer""" + cdef int error + if isinstance(description, unicode): + # encode to ascii for passing to C + description = (description).encode('ascii') + cdef const char* c_string = description + elpa_set_integer(self.handle, description, value, &error) + + def get_integer(self, description): + """Wraps elpa_get_integer""" + cdef int error + if isinstance(description, unicode): + # encode to ascii for passing to C + description = (description).encode('ascii') + cdef const char* c_string = description + cdef int tmp + elpa_get_integer(self.handle, c_string, &tmp, &error) + return tmp + + def set_double(self, description, double value): + """Wraps elpa_set_double""" + cdef int error + if isinstance(description, unicode): + # encode to ascii for passing to C + description = (description).encode('ascii') + cdef const char* c_string = description + elpa_set_double(self.handle, description, value, &error) + + def get_double(self, description): + """Wraps elpa_get_double""" + cdef int error + if isinstance(description, unicode): + # encode to ascii for passing to C + description = (description).encode('ascii') + 
cdef const char* c_string = description + cdef double tmp + elpa_get_double(self.handle, c_string, &tmp, &error) + return tmp + + def setup(self): + """call setup function""" + elpa_setup(self.handle) + + def __del__(self): + """Deallocation of handle and deinitialization""" + cdef int error + elpa_deallocate(self.handle, &error) + elpa_uninit(&error) + + def eigenvectors_d(self, + np.ndarray[np.float64_t, ndim=2] a, + np.ndarray[np.float64_t, ndim=1] ev, + np.ndarray[np.float64_t, ndim=2] q): + cdef int error + elpa_eigenvectors_d(self.handle, a.data, + ev.data, q.data, + &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvectors_f(self, + np.ndarray[np.float32_t, ndim=2] a, + np.ndarray[np.float32_t, ndim=1] ev, + np.ndarray[np.float32_t, ndim=2] q): + cdef int error + elpa_eigenvectors_f(self.handle, a.data, + ev.data, q.data, + &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvectors_dc(self, + np.ndarray[np.complex128_t, ndim=2] a, + np.ndarray[np.float64_t, ndim=1] ev, + np.ndarray[np.complex128_t, ndim=2] q): + cdef int error + elpa_eigenvectors_dc(self.handle, a.data, + ev.data, q.data, + &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvectors_fc(self, + np.ndarray[np.complex64_t, ndim=2] a, + np.ndarray[np.float32_t, ndim=1] ev, + np.ndarray[np.complex64_t, ndim=2] q): + cdef int error + elpa_eigenvectors_fc(self.handle, a.data, + ev.data, q.data, + &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvectors(self, a, ev, q): + """Compute eigenvalues and eigenvectors. 
+ + The data type of a is tested and the corresponding ELPA routine called + + Args: + a (DistributedMatrix): problem matrix + ev (numpy.ndarray): array of size a.na to store eigenvalues + q (DistributedMatrix): store eigenvectors + """ + if a.dtype == np.float64: + self.eigenvectors_d(a, ev, q) + elif a.dtype == np.float32: + self.eigenvectors_f(a, ev, q) + elif a.dtype == np.complex128: + self.eigenvectors_dc(a, ev, q) + elif a.dtype == np.complex64: + self.eigenvectors_fc(a, ev, q) + else: + raise TypeError("Type not known.") + + def eigenvalues_d(self, + np.ndarray[np.float64_t, ndim=2] a, + np.ndarray[np.float64_t, ndim=1] ev): + cdef int error + elpa_eigenvalues_d(self.handle, a.data, + ev.data, &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvalues_f(self, + np.ndarray[np.float32_t, ndim=2] a, + np.ndarray[np.float32_t, ndim=1] ev): + cdef int error + elpa_eigenvalues_f(self.handle, a.data, + ev.data, &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvalues_dc(self, + np.ndarray[np.complex128_t, ndim=2] a, + np.ndarray[np.float64_t, ndim=1] ev): + cdef int error + elpa_eigenvalues_dc(self.handle, a.data, + ev.data, &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvalues_fc(self, + np.ndarray[np.complex64_t, ndim=2] a, + np.ndarray[np.float32_t, ndim=1] ev): + cdef int error + elpa_eigenvalues_fc(self.handle, a.data, + ev.data, &error) + if error != ELPA_OK: + raise RuntimeError("ELPA returned error value {:d}.".format(error)) + + def eigenvalues(self, a, ev): + """Compute eigenvalues. 
+ + The data type of a is tested and the corresponding ELPA routine called + + Args: + a (DistributedMatrix): problem matrix + ev (numpy.ndarray): array of size a.na to store eigenvalues + """ + if a.dtype == np.float64: + self.eigenvalues_d(a, ev) + elif a.dtype == np.float32: + self.eigenvalues_f(a, ev) + elif a.dtype == np.complex128: + self.eigenvalues_dc(a, ev) + elif a.dtype == np.complex64: + self.eigenvalues_fc(a, ev) + else: + raise TypeError("Type not known.") + + @classmethod + def from_distributed_matrix(cls, a): + """Initialize ELPA with values from a distributed matrix + + Args: + a (DistributedMatrix): matrix to get values from + """ + self = cls() + # Set parameters the matrix and it's MPI distribution + self.set_integer("mpi_comm_parent", a.processor_layout.comm_f) + self.set_integer("na", a.na) + self.set_integer("nev", a.nev) + self.set_integer("local_nrows", a.na_rows) + self.set_integer("local_ncols", a.na_cols) + self.set_integer("nblk", a.nblk) + self.set_integer("process_row", a.processor_layout.my_prow) + self.set_integer("process_col", a.processor_layout.my_pcol) + # Setup + self.setup() + # if desired, set tunable run-time options + self.set_integer("solver", ELPA_SOLVER_2STAGE) + return self diff -Nru elpa-2016.05.001/python/tests/test_elpa_import.py elpa-2019.11.001/python/tests/test_elpa_import.py --- elpa-2016.05.001/python/tests/test_elpa_import.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/tests/test_elpa_import.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,6 @@ +def test_pyelpa_import(): + import pyelpa + +def test_elpa_supported_api_version(): + from pyelpa import Elpa + e = Elpa() diff -Nru elpa-2016.05.001/python/tests/test_mpi4py.py elpa-2019.11.001/python/tests/test_mpi4py.py --- elpa-2016.05.001/python/tests/test_mpi4py.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/tests/test_mpi4py.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,9 @@ +def test_import(): + from mpi4py import MPI 
+ comm = MPI.COMM_WORLD + + +def test_barrier(): + from mpi4py import MPI + comm = MPI.COMM_WORLD + comm.Barrier() diff -Nru elpa-2016.05.001/python/tests/test_numroc.py elpa-2019.11.001/python/tests/test_numroc.py --- elpa-2016.05.001/python/tests/test_numroc.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/tests/test_numroc.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,7 @@ +def test_numroc(): + from pyelpa import DistributedMatrix + n = 100 + nb = 16 + assert(DistributedMatrix.numroc(n, nb, 0, 0, 3) == 36) + assert(DistributedMatrix.numroc(n, nb, 1, 0, 3) == 32) + assert(DistributedMatrix.numroc(n, nb, 1, 1, 3) == 36) diff -Nru elpa-2016.05.001/python/tests/test_with_mpi.py elpa-2019.11.001/python/tests/test_with_mpi.py --- elpa-2016.05.001/python/tests/test_with_mpi.py 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/python/tests/test_with_mpi.py 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,438 @@ +import pytest + +# combinations of na, nev, nblk to run all the tests with +parameter_list = [ + (200, 20, 16), + (200, 200, 16), + (200, 20, 64), + (200, 200, 64), + (200, 20, 4), + (200, 200, 4), + (50, 20, 16), + (100, 20, 16), +] + +def get_random_vector(size): + """generate random vector with given size that is equal on all cores""" + import numpy as np + from mpi4py import MPI + comm = MPI.COMM_WORLD + am_i_root = comm.Get_rank() == 0 + vector = np.empty(size) + if am_i_root: + vector[:] = np.random.rand(size) + comm.Bcast(vector) + return vector + + +def test_processor_layout(): + from pyelpa import ProcessorLayout + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + assert(comm.Get_size() == layout_p.np_cols*layout_p.np_rows) + assert(layout_p.my_prow >= 0) + assert(layout_p.my_pcol >= 0) + assert(layout_p.my_prow <= comm.Get_size()) + assert(layout_p.my_pcol <= comm.Get_size()) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_from_processor_layout(na, 
nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.float32, np.complex64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + assert(a.data.dtype == dtype) + assert(a.data.shape == (a.na_rows, a.na_cols)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_from_communicator(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + + for dtype in [np.float64, np.float32, np.complex64, np.complex128]: + a = DistributedMatrix.from_communicator(comm, na, nev, nblk, + dtype=dtype) + assert(a.data.dtype == dtype) + assert(a.data.shape == (a.na_rows, a.na_cols)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_from_world(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.float32, np.complex64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + assert(a.data.dtype == dtype) + assert(a.data.shape == (a.na_rows, a.na_cols)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_like_other_matrix(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.float32, np.complex64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + b = DistributedMatrix.like(a) + assert(a.na == b.na) + assert(a.nev == b.nev) + assert(a.nblk == b.nblk) + assert(a.data.dtype == b.data.dtype) + assert(a.data.shape == b.data.shape) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_call_eigenvectors(na, nev, nblk): + import numpy as np + from pyelpa 
import ProcessorLayout, DistributedMatrix, Elpa + from mpi4py import MPI + + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + # create arrays + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + a.data[:, :] = np.random.rand(a.na_rows, a.na_cols).astype(dtype) + q = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + ev = np.zeros(na, dtype=np.float64) + + e = Elpa.from_distributed_matrix(a) + e.eigenvectors(a.data, ev, q.data) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_call_eigenvalues(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix, Elpa + from mpi4py import MPI + + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + # create arrays + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + a.data[:, :] = np.random.rand(a.na_rows, a.na_cols).astype(dtype) + ev = np.zeros(na, dtype=np.float64) + + e = Elpa.from_distributed_matrix(a) + e.eigenvalues(a.data, ev) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_compare_eigenvalues_to_those_from_eigenvectors(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix, Elpa + from mpi4py import MPI + + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + # create arrays + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + random_matrix = np.random.rand(a.na_rows, a.na_cols).astype(dtype) + a.data[:, :] = random_matrix + q = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + ev = np.zeros(na, dtype=np.float64) + ev2 = np.zeros(na, dtype=np.float64) + + e = Elpa.from_distributed_matrix(a) + e.eigenvectors(a.data, ev, q.data) + + a.data[:, :] = random_matrix + e.eigenvalues(a.data, ev2) + + assert(np.allclose(ev, ev2)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def 
test_compare_eigenvalues_to_those_from_eigenvectors_self_functions( + na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + # create arrays + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + random_matrix = np.random.rand(a.na_rows, a.na_cols).astype(dtype) + a.data[:, :] = random_matrix + data = a.compute_eigenvectors() + + a.data[:, :] = random_matrix + eigenvalues = a.compute_eigenvalues() + + assert(np.allclose(data['eigenvalues'], eigenvalues)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_global_index(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + for local_row in range(a.na_rows): + for local_col in range(a.na_cols): + global_row, global_col = a.get_global_index(local_row, + local_col) + l_row, l_col = a.get_local_index(global_row, global_col) + assert(global_row >= 0 and global_row < a.na) + assert(global_col >= 0 and global_col < a.na) + assert(local_row == l_row and local_col == l_col) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_local_index(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + for global_row in range(a.na): + for global_col in range(a.na): + if not a.is_local_index(global_row, global_col): + continue + local_row, local_col = a.get_local_index(global_row, + global_col) + g_row, g_col = a.get_global_index(local_row, local_col) + assert(local_row >= 0 and local_row < a.na_rows) + assert(local_col >= 0 
and local_col < a.na_cols) + assert(global_row == g_row and global_col == g_col) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_distributed_matrix_indexing_loop(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + for local_row in range(a.na_rows): + for local_col in range(a.na_cols): + global_row, global_col = a.get_global_index(local_row, + local_col) + a.data[local_row, local_col] = global_row*10 + global_col + + for global_row in range(a.na): + for global_col in range(a.na): + if not a.is_local_index(global_row, global_col): + continue + local_row, local_col = a.get_local_index(global_row, + global_col) + assert(a.data[local_row, local_col] == + global_row*10 + global_col) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_setting_global_matrix(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + from mpi4py import MPI + comm = MPI.COMM_WORLD + layout_p = ProcessorLayout(comm) + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix(layout_p, na, nev, nblk, dtype=dtype) + # get global matrix that is equal on all cores + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + a.set_data_from_global_matrix(matrix) + + # check data + for global_row in range(a.na): + for global_col in range(a.na): + if not a.is_local_index(global_row, global_col): + continue + local_row, local_col = a.get_local_index(global_row, + global_col) + assert(a.data[local_row, local_col] == + matrix[global_row, global_col]) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_dot_product(na, nev, nblk): + import numpy as np + from pyelpa import ProcessorLayout, DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = 
DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + # get global matrix and vector that is equal on all cores + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + vector = get_random_vector(na).astype(dtype) + + a.set_data_from_global_matrix(matrix) + + product_distributed = a.dot(vector) + product_naive = a._dot_naive(vector) + product_serial = np.dot(matrix, vector) + + assert(np.allclose(product_distributed, product_serial)) + assert(np.allclose(product_distributed, product_naive)) + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_dot_product_incompatible_size(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + # get global matrix and vector that is equal on all cores + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + vector = get_random_vector(na*2).astype(dtype) + + a.set_data_from_global_matrix(matrix) + + with pytest.raises(ValueError): + product_distributed = a.dot(vector) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_validate_eigenvectors(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + # get a symmetric/hermitian matrix + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + matrix = 0.5*(matrix + np.conj(matrix.T)) + a.set_data_from_global_matrix(matrix) + + data = a.compute_eigenvectors() + eigenvalues = data['eigenvalues'] + eigenvectors = data['eigenvectors'] + # reset data of a + a.set_data_from_global_matrix(matrix) + for index in range(a.nev): + eigenvector = eigenvectors.get_column(index) + scaled_eigenvector = eigenvalues[index]*eigenvector + # test solution + assert(np.allclose(a.dot(eigenvector), + scaled_eigenvector)) + + +@pytest.mark.parametrize("na,nev,nblk", 
parameter_list) +def test_validate_eigenvectors_to_numpy(na, nev, nblk): + import numpy as np + from numpy import linalg + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + # get a symmetric/hermitian matrix + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + matrix = 0.5*(matrix + np.conj(matrix.T)) + a.set_data_from_global_matrix(matrix) + + data = a.compute_eigenvectors() + eigenvalues = data['eigenvalues'] + eigenvectors = data['eigenvectors'] + + # get numpy solution + eigenvalues_np, eigenvectors_np = linalg.eigh(matrix) + + assert(np.allclose(eigenvalues, eigenvalues_np)) + for index in range(a.nev): + eigenvector = eigenvectors.get_column(index) + assert(np.allclose(eigenvector, eigenvectors_np[:, index]) or + np.allclose(eigenvector, -eigenvectors_np[:, index])) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_accessing_matrix(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + matrix = get_random_vector(na*na).reshape(na, na).astype(dtype) + a.set_data_from_global_matrix(matrix) + + for index in range(a.na): + column = a.get_column(index) + assert(np.allclose(column, matrix[:, index])) + row = a.get_row(index) + assert(np.allclose(row, matrix[index, :])) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_global_index_iterator(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + for i, j in a.global_indices(): + assert(a.is_local_index(i, j)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_global_index_access(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in 
[np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + for i, j in a.global_indices(): + x = dtype(i*j) + a.set_data_for_global_index(i, j, x) + for i, j in a.global_indices(): + x = a.get_data_for_global_index(i, j) + assert(np.isclose(x, i*j)) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_global_block_iterator(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + for i, j, blk_i, blk_j in a.global_block_indices(): + assert(a.is_local_index(i, j)) + assert(blk_i <= nblk) + assert(blk_j <= nblk) + assert(i+blk_i <= na) + assert(j+blk_j <= na) + + +@pytest.mark.parametrize("na,nev,nblk", parameter_list) +def test_global_block_access(na, nev, nblk): + import numpy as np + from pyelpa import DistributedMatrix + + for dtype in [np.float64, np.complex128]: + a = DistributedMatrix.from_comm_world(na, nev, nblk, dtype=dtype) + for i, j, blk_i, blk_j in a.global_block_indices(): + x = np.arange(i, i+blk_i)[:, None] * np.arange(j, j+blk_j)[None, :] + a.set_block_for_global_index(i, j, blk_i, blk_j, x) + for i, j, blk_i, blk_j in a.global_block_indices(): + original = np.arange(i, i+blk_i)[:, None] * np.arange(j, j+blk_j)[None, :] + x = a.get_block_for_global_index(i, j, blk_i, blk_j) + assert(np.allclose(x, original)) + for i, j in a.global_indices(): + x = a.get_data_for_global_index(i, j) + assert(np.isclose(x, i*j)) diff -Nru elpa-2016.05.001/README.md elpa-2019.11.001/README.md --- elpa-2016.05.001/README.md 2016-05-20 07:40:52.000000000 +0000 +++ elpa-2019.11.001/README.md 2019-12-20 05:57:47.000000000 +0000 @@ -1,10 +1,33 @@ -# [Eigenvalue SoLvers for Petaflop-Applications (ELPA)] (http://elpa.mpcdf.mpg.de) +# [Eigenvalue SoLvers for Petaflop-Applications (ELPA)](http://elpa.mpcdf.mpg.de) ## Current Release ## -The current release is ELPA 2016.05.001 +The current 
release is ELPA 2019.11.001 The current supported API version +is 20190501. This release supports the earliest API version 20170403. -## About *ELPA* +The old, obsolete legacy API will be deprecated in the future ! +Allready now, all new features of ELPA are only available with the new API. Thus, there +is no reason to keep the legacy API arround for too long. + +The release ELPA 2018.11.001 was the last release, where the legacy API has been +enabled by default (and can be disabled at build time). +With release ELPA 2019.05.001 the legacy API is disabled by default, however, +can be still switched on at build time. +With the release ELPA 2019.11.001 the legacy API will be deprecated and +not supported anymore. + +[![Build +status](https://gitlab.mpcdf.mpg.de/elpa/elpa/badges/master/build.svg)](https://gitlab.mpcdf.mpg.de/elpa/elpa/commits/master) + +[![Code +coverage](https://gitlab.mpcdf.mpg.de/elpa/badges/master/coverage.svg)](http://elpa.pages.mpcdf.de/elpa/coverage_summary) + +![License LGPL v3][license-badge] + +[license-badge]: https://img.shields.io/badge/License-LGPL%20v3-blue.svg + + +## About *ELPA* ## The computation of selected or all eigenvalues and eigenvectors of a symmetric (Hermitian) matrix has high relevance for various scientific disciplines. @@ -30,7 +53,7 @@ - Technische Universität München, Lehrstuhl für Informatik mit Schwerpunkt Wissenschaftliches Rechnen , - Fritz-Haber-Institut, Berlin, Abt. Theorie, -- Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +- Max-Plack-Institut für Mathematik in den Naturwissenschaften, Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, and - IBM Deutschland GmbH @@ -42,9 +65,9 @@ There exist several ways to obtain the *ELPA* library either as sources or pre-compiled packages: -- official release tar-gz sources from the [*ELPA* webpage] (http://elpa.mpcdf.mpg.de/elpa-tar-archive) -- from the [*ELPA* git repository] (https://gitlab.mpcdf.mpg.de/elpa/elpa) -- as packaged software for several Linux distribtutions (e.g. Debian, Fedora, OpenSuse) +- official release tar-gz sources from the *ELPA* [webpage](https://elpa.mpcdf.mpg.de/elpa-tar-archive) +- from the *ELPA* [git repository](https://gitlab.mpcdf.mpg.de/elpa/elpa) +- as packaged software for several Linux distributions (e.g. Debian, Fedora, OpenSuse) ## Terms of usage @@ -55,6 +78,8 @@ Nonetheless, we are grateful if you cite the following publications: + If you use ELPA in general: + T. Auckenthaler, V. Blum, H.-J. Bungartz, T. Huckle, R. Johanni, L. Kr\"amer, B. Lang, H. Lederer, and P. R. Willems, "Parallel solution of partial symmetric eigenvalue problems from @@ -68,10 +93,30 @@ structure theory and computational science", Journal of Physics Condensed Matter, 26 (2014) doi:10.1088/0953-8984/26/21/213201 + + If you use the GPU version of ELPA: + + Kus, P; Marek, A.; Lederer, H. + "GPU Optimization of Large-Scale Eigenvalue Solver", + In: Radu F., Kumar K., Berre I., Nordbotten J., Pop I. (eds) + Numerical Mathematics and Advanced Applications ENUMATH 2017. ENUMATH 2017. + Lecture Notes in Computational Science and Engineering, vol 126. Springer, Cham + + If you use the new API and/or autotuning: + + Kus, P.; Marek, A.; Koecher, S. S.; Kowalski H.-H.; Carbogno, Ch.; Scheurer, Ch.; Reuter, K.; Scheffler, M.; Lederer, H. 
+ "Optimizations of the Eigenvaluesolvers in the ELPA Library", + Parallel Computing 85, 167-177 (2019) + + If you use the new support for skew-symmetric matrices: + Benner, P.; Draxl, C.; Marek, A.; Penke C.; Vorwerk, C.; + "High Performance Solution of Skew-symmetric Eigenvalue Problems with Applications in Solving the Bethe-Salpeter Eigenvalue Problem", + https://arxiv.org/abs/1912.04062, submitted to Parallel Computing + ## Installation of the *ELPA* library -*ELPA* is shipped with a standard autotools automake installation infrastruture. +*ELPA* is shipped with a standard autotools automake installation infrastructure. Some other libraries are needed to install *ELPA* (the details depend on how you configure *ELPA*): @@ -81,13 +126,12 @@ - Scalapack routines - a working MPI library -Please refer to the **INSTALL document** on details of the installation process and +Please refer to the [INSTALL document](INSTALL.md) on details of the installation process and the possible configure options. ## Using *ELPA* -Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online] -(http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2016.05.001/html/index.html) doygen +Please have a look at the [USERS_GUIDE](USERS_GUIDE.md) file, to get a documentation or at the [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.11.001/html/index.html) doxygen documentation, where you find the definition of the interfaces. ## Contributing to *ELPA* @@ -95,7 +139,7 @@ It has been, and is, a tremendous effort to develop and maintain the *ELPA* library. A lot of things can still be done, but our man-power is limited. -Thus every effort and help to improve the *ELPA* library is higly appreciated. -For details please see the CONTRIBUTING document. +Thus every effort and help to improve the *ELPA* library is highly appreciated. +For details please see the [CONTRIBUTING](CONTRIBUTING.md) document. 
diff -Nru elpa-2016.05.001/remove_xcompiler elpa-2019.11.001/remove_xcompiler --- elpa-2016.05.001/remove_xcompiler 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/remove_xcompiler 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,6 @@ +#!/usr/bin/env python +import sys +import os + +args = [q for q in sys.argv[1:] if q != "-Xcompiler"] +os.execvp(args[0], args[0:]) diff -Nru elpa-2016.05.001/src/aligned_mem.F90 elpa-2019.11.001/src/aligned_mem.F90 --- elpa-2016.05.001/src/aligned_mem.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/src/aligned_mem.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Lorenz Huedepohl, MPCDF - -module aligned_mem - use, intrinsic :: iso_c_binding - - interface - function posix_memalign(memptr, alignment, size) result(error) bind(C, name="posix_memalign") - import c_int, c_size_t, c_ptr - integer(kind=c_int) :: error - type(c_ptr), intent(inout) :: memptr - integer(kind=c_size_t), intent(in), value :: alignment, size - end function - end interface - - interface - subroutine free(ptr) bind(C, name="free") - import c_ptr - type(c_ptr), value :: ptr - end subroutine - end interface - -end module diff -Nru elpa-2016.05.001/src/elpa1/elpa1_auxiliary.F90 elpa-2019.11.001/src/elpa1/elpa1_auxiliary.F90 --- elpa-2016.05.001/src/elpa1/elpa1_auxiliary.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_auxiliary.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,495 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! 
enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! This file has been rewritten by A. 
Marek, MPCDF +#include "config-f90.h" + +!> \brief Fortran module which provides helper routines for matrix calculations +module elpa1_auxiliary_impl + use elpa_utilities + + implicit none + + public :: elpa_mult_at_b_real_double_impl !< Multiply double-precision real matrices A**T * B + + public :: elpa_mult_ah_b_complex_double_impl !< Multiply double-precision complex matrices A**H * B + + public :: elpa_invert_trm_real_double_impl !< Invert double-precision real triangular matrix + + public :: elpa_invert_trm_complex_double_impl !< Invert double-precision complex triangular matrix + + public :: elpa_cholesky_real_double_impl !< Cholesky factorization of a double-precision real matrix + + public :: elpa_cholesky_complex_double_impl !< Cholesky factorization of a double-precision complex matrix + + public :: elpa_solve_tridi_double_impl !< Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: elpa_cholesky_real_single_impl !< Cholesky factorization of a single-precision real matrix + public :: elpa_invert_trm_real_single_impl !< Invert single-precision real triangular matrix + public :: elpa_mult_at_b_real_single_impl !< Multiply single-precision real matrices A**T * B + public :: elpa_solve_tridi_single_impl !< Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: elpa_cholesky_complex_single_impl !< Cholesky factorization of a single-precision complex matrix + public :: elpa_invert_trm_complex_single_impl !< Invert single-precision complex triangular matrix + public :: elpa_mult_ah_b_complex_single_impl !< Multiply single-precision complex matrices A**H * B +#endif + + contains + +#define REALCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + + function elpa_cholesky_real_double_impl (obj, a) result(success) +#include "elpa_cholesky_template.F90" + + end 
function elpa_cholesky_real_double_impl +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + + function elpa_cholesky_real_single_impl(obj, a) result(success) +#include "elpa_cholesky_template.F90" + + end function elpa_cholesky_real_single_impl +#undef SINGLE_PRECISION +#undef REALCASE + +#endif /* WANT_SINGLE_PRECISION_REAL */ + +#define REALCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" +!> \brief elpa_invert_trm_real_double: Inverts a double-precision real upper triangular matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. 
+!> \result succes logical, reports success or failure + function elpa_invert_trm_real_double_impl(obj, a) result(success) +#include "elpa_invert_trm.F90" + end function elpa_invert_trm_real_double_impl +#undef DOUBLE_PRECISION +#undef REALCASE + +#if WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_invert_trm_real_single_impl: Inverts a single-precision real upper triangular matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. +!> \result succes logical, reports success or failure + + function elpa_invert_trm_real_single_impl(obj, a) result(success) +#include "elpa_invert_trm.F90" + end function elpa_invert_trm_real_single_impl +#undef SINGLE_PRECISION +#undef REALCASE + +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_cholesky_complex_double_impl: Cholesky factorization of a double-precision complex hermitian matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. +!> \result succes logical, reports success or failure + function elpa_cholesky_complex_double_impl(obj, a) result(success) + +#include "elpa_cholesky_template.F90" + + end function elpa_cholesky_complex_double_impl +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_cholesky_complex_single_impl: Cholesky factorization of a single-precision complex hermitian matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. 
+!> \result succes logical, reports success or failure + function elpa_cholesky_complex_single_impl(obj, a) result(success) + +#include "elpa_cholesky_template.F90" + + end function elpa_cholesky_complex_single_impl +#undef SINGLE_PRECISION +#undef COMPLEXCASE + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_invert_trm_complex_double_impl: Inverts a double-precision complex upper triangular matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. +!> \result succes logical, reports success or failure + function elpa_invert_trm_complex_double_impl(obj, a) result(success) +#include "elpa_invert_trm.F90" + end function elpa_invert_trm_complex_double_impl +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_invert_trm_complex_single_impl: Inverts a single-precision complex upper triangular matrix +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix a +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param a(lda,matrixCols) Distributed matrix which should be inverted +!> Distribution is like in Scalapack. +!> Only upper triangle needs to be set. +!> The lower triangle is not referenced. +!> \result success logical, reports success or failure + function elpa_invert_trm_complex_single_impl(obj, a) result(success) +#include "elpa_invert_trm.F90" + end function elpa_invert_trm_complex_single_impl +#undef SINGLE_PRECISION +#undef COMPLEXCASE + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define REALCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + function elpa_mult_at_b_real_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, & + c, ldc, ldcCols) result(success) +#include "elpa_multiply_a_b.F90" + end function elpa_mult_at_b_real_double_impl +#undef DOUBLE_PRECISION +#undef REALCASE + +#if WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_mult_at_b_real_single_impl: Performs C : = A**T * B +!> where A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular +!> B is a (obj%na,ncb) matrix +!> C is a (obj%na,ncb) matrix where optionally only the upper or lower +!> triangle may be computed +!> \details + +!> \param uplo_a 'U' if A is upper triangular +!> 'L' if A is lower triangular +!> anything else if A is a full matrix +!> Please note: This pertains to the original A (as set in the calling program) +!> whereas the transpose of A is used for calculations +!> If uplo_a is 'U' or 'L', the other triangle is not used at all, +!> i.e. 
it may contain arbitrary numbers +!> \param uplo_c 'U' if only the upper diagonal part of C is needed +!> 'L' if only the upper diagonal part of C is needed +!> anything else if the full matrix C is needed +!> Please note: Even when uplo_c is 'U' or 'L', the other triangle may be +!> written to a certain extent, i.e. one shouldn't rely on the content there! +!> \param na Number of rows/columns of A, number of rows of B and C +!> \param ncb Number of columns of B and C +!> \param a matrix a +!> \param obj%local_nrows leading dimension of matrix a, set with class method obj%set("local_nrows",value) +!> \param b matrix b +!> \param ldb leading dimension of matrix b +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param c matrix c +!> \param ldc leading dimension of matrix c +!> \result success + + function elpa_mult_at_b_real_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, & + c, ldc, ldcCols) result(success) + +#include "elpa_multiply_a_b.F90" + + end function elpa_mult_at_b_real_single_impl +#undef SINGLE_PRECISION +#undef REALCASE +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_mult_ah_b_complex_double_impl: Performs C : = A**H * B +!> where A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular +!> B is a (obj%na,ncb) matrix +!> C is a (obj%na,ncb) matrix where optionally only the upper or lower +!> triangle may be computed +!> \details +!> +!> \param uplo_a 'U' if A is upper triangular +!> 'L' if A is lower triangular +!> anything else if A is a full matrix +!> Please note: This pertains to the original A (as set in the calling program) +!> whereas the transpose of A is used for calculations +!> If uplo_a is 'U' or 'L', the other triangle is not used at all, +!> i.e. 
it may contain arbitrary numbers +!> \param uplo_c 'U' if only the upper diagonal part of C is needed +!> 'L' if only the upper diagonal part of C is needed +!> anything else if the full matrix C is needed +!> Please note: Even when uplo_c is 'U' or 'L', the other triangle may be +!> written to a certain extent, i.e. one shouldn't rely on the content there! +!> \param na Number of rows/columns of A, number of rows of B and C +!> \param ncb Number of columns of B and C +!> \param a matrix a +!> \param obj%local_ncols leading dimension of matrix a, set with class method obj%set("local_nrows",value) +!> \param ldaCols columns of matrix a +!> \param b matrix b +!> \param ldb leading dimension of matrix b +!> \param ldbCols columns of matrix b +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param c matrix c +!> \param ldc leading dimension of matrix c +!> \result success + + function elpa_mult_ah_b_complex_double_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, & + c, ldc, ldcCols) result(success) +#include "elpa_multiply_a_b.F90" + + end function elpa_mult_ah_b_complex_double_impl +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_mult_ah_b_complex_single_impl: Performs C : = A**H * B +!> where A is a square matrix (obj%na,obj%na) which is optionally upper or lower triangular +!> B is a (obj%na,ncb) matrix +!> C is a (obj%na,ncb) matrix where optionally only the upper or lower +!> triangle may be computed +!> \details +!> +!> \param uplo_a 'U' if A is upper triangular +!> 'L' if A is lower triangular +!> anything else if A is a full matrix +!> Please note: This pertains to the original A (as set in the calling program) +!> whereas the transpose of A is used for calculations +!> If 
uplo_a is 'U' or 'L', the other triangle is not used at all, +!> i.e. it may contain arbitrary numbers +!> \param uplo_c 'U' if only the upper diagonal part of C is needed +!> 'L' if only the upper diagonal part of C is needed +!> anything else if the full matrix C is needed +!> Please note: Even when uplo_c is 'U' or 'L', the other triangle may be +!> written to a certain extent, i.e. one shouldn't rely on the content there! +!> \param na Number of rows/columns of A, number of rows of B and C +!> \param ncb Number of columns of B and C +!> \param a matrix a +!> \param lda leading dimension of matrix a +!> \param ldaCols columns of matrix a +!> \param b matrix b +!> \param ldb leading dimension of matrix b +!> \param ldbCols columns of matrix b +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param c matrix c +!> \param ldc leading dimension of matrix c +!> \result success + + function elpa_mult_ah_b_complex_single_impl(obj, uplo_a, uplo_c, ncb, a, b, ldb, ldbCols, & + c, ldc, ldcCols) result(success) + +#include "elpa_multiply_a_b.F90" + + end function elpa_mult_ah_b_complex_single_impl +#undef SINGLE_PRECISION +#undef COMPLEXCASE +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define REALCASE 1 +#define DOUBLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_solve_tridi_double_impl: Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> \param - obj%local_nrows Leading dimension of q +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param d array d(na) on input diagonal elements of tridiagonal matrix, on +!> output the eigenvalues in ascending order +!> \param e array e(na) on input subdiagonal elements of matrix, on exit destroyed +!> \param q on exit : matrix q(ldq,matrixCols) contains the eigenvectors +!> \result succes logical, reports success or failure + function elpa_solve_tridi_double_impl(obj, d, e, q) result(success) + +#include "elpa_solve_tridi_impl_public.F90" + + end function +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" + +!> \brief elpa_solve_tridi_single_impl: Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> \param - obj%local_nrows Leading dimension of q +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%wantDebug logical, more debug information on failure +!> \param d array d(na) on input diagonal elements of tridiagonal matrix, on +!> output the eigenvalues in ascending order +!> \param e array e(na) on input subdiagonal elements of matrix, on exit destroyed +!> \param q on exit : matrix q(ldq,matrixCols) contains the eigenvectors +!> \result succes logical, reports success or failure + function elpa_solve_tridi_single_impl(obj, d, e, q) result(success) + +#include "elpa_solve_tridi_impl_public.F90" + + end function +#undef SINGLE_PRECISION +#undef REALCASE +#endif /* WANT_SINGLE_PRECISION_REAL */ + + + + +end module elpa1_auxiliary_impl + diff -Nru elpa-2016.05.001/src/elpa1/elpa1_compute_private.F90 elpa-2019.11.001/src/elpa1/elpa1_compute_private.F90 --- elpa-2016.05.001/src/elpa1/elpa1_compute_private.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_compute_private.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,272 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! 
http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "config-f90.h" +!> \brief Fortran module which contains the source of ELPA 1stage +module elpa1_compute + use elpa_utilities + use elpa_mpi + implicit none + + PRIVATE ! set default to private + + public :: tridiag_real_double ! Transform real symmetric matrix to tridiagonal form + public :: tridiag_real + public :: trans_ev_real_double ! 
Transform real eigenvectors of a tridiagonal matrix back + public :: trans_ev_real + + public :: solve_tridi_double + public :: solve_tridi_double_impl + + interface tridiag_real + module procedure tridiag_real_double + end interface + + interface trans_ev_real + module procedure trans_ev_real_double + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: tridiag_real_single ! Transform real single-precision symmetric matrix to tridiagonal form + public :: trans_ev_real_single ! Transform real single-precision eigenvectors of a tridiagonal matrix back + public :: solve_tridi_single + public :: solve_tridi_single_impl +#endif + + public :: tridiag_complex_double ! Transform complex hermitian matrix to tridiagonal form + public :: tridiag_complex + public :: trans_ev_complex_double ! Transform eigenvectors of a tridiagonal matrix back + public :: trans_ev_complex + + interface tridiag_complex + module procedure tridiag_complex_double + end interface + + interface trans_ev_complex + module procedure trans_ev_complex_double + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: tridiag_complex_single ! Transform complex single-precision hermitian matrix to tridiagonal form + public :: trans_ev_complex_single ! 
Transform complex single-precision eigenvectors of a tridiagonal matrix back +#endif + + public :: hh_transform_real_double + public :: hh_transform_real + public :: elpa_reduce_add_vectors_real_double + public :: elpa_reduce_add_vectors_real + public :: elpa_transpose_vectors_real_double + public :: elpa_transpose_vectors_ss_real_double + public :: elpa_transpose_vectors_real + public :: elpa_transpose_vectors_ss_real + + interface hh_transform_real + module procedure hh_transform_real_double + end interface + + interface elpa_reduce_add_vectors_real + module procedure elpa_reduce_add_vectors_real_double + end interface + + interface elpa_transpose_vectors_real + module procedure elpa_transpose_vectors_real_double + end interface + + interface elpa_transpose_vectors_ss_real + module procedure elpa_transpose_vectors_ss_real_double + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: hh_transform_real_single + public :: elpa_reduce_add_vectors_real_single + public :: elpa_transpose_vectors_real_single + public :: elpa_transpose_vectors_ss_real_single +#endif + + public :: hh_transform_complex_double + public :: hh_transform_complex + public :: elpa_reduce_add_vectors_complex_double + public :: elpa_reduce_add_vectors_complex + public :: elpa_transpose_vectors_complex_double + public :: elpa_transpose_vectors_ss_complex_double + public :: elpa_transpose_vectors_complex + public :: elpa_transpose_vectors_ss_complex + + interface hh_transform_complex + module procedure hh_transform_complex_double + end interface + + interface elpa_reduce_add_vectors_complex + module procedure elpa_reduce_add_vectors_complex_double + end interface + + interface elpa_transpose_vectors_complex + module procedure elpa_transpose_vectors_complex_double + end interface + + interface elpa_transpose_vectors_ss_complex + module procedure elpa_transpose_vectors_ss_complex_double + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: hh_transform_complex_single + public :: 
elpa_reduce_add_vectors_complex_single + public :: elpa_transpose_vectors_complex_single + public :: elpa_transpose_vectors_ss_complex_single +#endif + + contains + +! real double precision first +#define DOUBLE_PRECISION_REAL 1 +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" + + +#include "elpa_transpose_vectors.F90" +#define SKEW_SYMMETRIC_BUILD +#include "elpa_transpose_vectors.F90" +#undef SKEW_SYMMETRIC_BUILD +#include "elpa_reduce_add_vectors.F90" +#undef DOUBLE_PRECISION +#undef REALCASE +! single precision +#ifdef WANT_SINGLE_PRECISION_REAL + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" + +#include "elpa_transpose_vectors.F90" +#define SKEW_SYMMETRIC_BUILD +#include "elpa_transpose_vectors.F90" +#undef SKEW_SYMMETRIC_BUILD +#include "elpa_reduce_add_vectors.F90" +#undef SINGLE_PRECISION +#undef REALCASE +#endif + +! double precision + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa_transpose_vectors.F90" +#define SKEW_SYMMETRIC_BUILD +#include "elpa_transpose_vectors.F90" +#undef SKEW_SYMMETRIC_BUILD +#include "elpa_reduce_add_vectors.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa_transpose_vectors.F90" +#define SKEW_SYMMETRIC_BUILD +#include "elpa_transpose_vectors.F90" +#undef SKEW_SYMMETRIC_BUILD +#include "elpa_reduce_add_vectors.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +! real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_compute_template.F90" + +#undef REALCASE +#undef DOUBLE_PRECISION + +! 
real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) + +#define REALCASE 1 +#define SINGLE_PRECISION 1 + +#include "../general/precision_macros.h" +#include "elpa1_compute_template.F90" + +#undef REALCASE +#undef SINGLE_PRECISION +#endif /* WANT_SINGLE_PRECISION_REAL */ + +! complex double precision + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_compute_template.F90" + +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +! complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_compute_template.F90" + +#undef COMPLEXCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +end module elpa1_compute diff -Nru elpa-2016.05.001/src/elpa1/elpa1_compute_template.F90 elpa-2019.11.001/src/elpa1/elpa1_compute_template.F90 --- elpa-2016.05.001/src/elpa1/elpa1_compute_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_compute_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,105 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! 
+! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
Author: Andreas Marek, MPCDF +#endif + +#include "../general/sanity.F90" + +#if REALCASE == 1 + +!cannot use __FILE__ because filename with path can be too long for gfortran (max line length) +#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success) +#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success) +#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success) + +#endif + +#if REALCASE == 1 + +#include "elpa1_tridiag_template.F90" +#include "elpa1_trans_ev_template.F90" + +! now comes a dirty hack: +! the file elpa1_solve_tridi_real_template.F90 must be included twice +! for the legacy and for the new API. In the new API, however, some routines +! must be named "..._impl" + +#ifdef DOUBLE_PRECISION_REAL +#define PRECISION_AND_SUFFIX double +#else +#define PRECISION_AND_SUFFIX single +#endif +#include "elpa1_solve_tridi_real_template.F90" +#undef PRECISION_AND_SUFFIX +#ifdef DOUBLE_PRECISION_REAL +#define PRECISION_AND_SUFFIX double_impl +#else +#define PRECISION_AND_SUFFIX single_impl +#endif +#include "elpa1_solve_tridi_real_template.F90" +#undef PRECISION_AND_SUFFIX +#include "elpa1_merge_systems_real_template.F90" +#include "elpa1_tools_template.F90" + +#endif + +#if COMPLEXCASE == 1 + +#include "elpa1_tridiag_template.F90" +#include "elpa1_trans_ev_template.F90" +#include "elpa1_tools_template.F90" + +#define ALREADY_DEFINED 1 + +#endif diff -Nru elpa-2016.05.001/src/elpa1/elpa1.F90 elpa-2019.11.001/src/elpa1/elpa1.F90 --- elpa-2016.05.001/src/elpa1/elpa1.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,289 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! 
- Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". 
+ +!> \mainpage +!> Eigenvalue SoLvers for Petaflop-Applications (ELPA) +!> \par +!> http://elpa.mpcdf.mpg.de +!> +!> \par +!> The ELPA library was originally created by the ELPA consortium, +!> consisting of the following organizations: +!> +!> - Max Planck Computing and Data Facility (MPCDF) formerly known as +!> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +!> - Bergische Universität Wuppertal, Lehrstuhl für angewandte +!> Informatik, +!> - Technische Universität München, Lehrstuhl für Informatik mit +!> Schwerpunkt Wissenschaftliches Rechnen , +!> - Fritz-Haber-Institut, Berlin, Abt. Theorie, +!> - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +!> Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +!> and +!> - IBM Deutschland GmbH +!> +!> Some parts and enhancements of ELPA have been contributed and authored +!> by the Intel Corporation which is not part of the ELPA consortium. +!> +!> Contributions to the ELPA source have been authored by (in alphabetical order): +!> +!> \author T. Auckenthaler, Volker Blum, A. Heinecke, L. Huedepohl, R. Johanni, Werner Jürgens, and A. Marek + + +#include "config-f90.h" + +!> \brief Fortran module which provides the routines to use the one-stage ELPA solver +module elpa1_impl + use, intrinsic :: iso_c_binding + use elpa_utilities + use elpa1_auxiliary_impl +#ifdef HAVE_LIKWID + use likwid +#endif + + implicit none + + ! 
The following routines are public: + private + + public :: elpa_solve_evp_real_1stage_double_impl !< Driver routine for real double-precision 1-stage eigenvalue problem + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: elpa_solve_evp_real_1stage_single_impl !< Driver routine for real single-precision 1-stage eigenvalue problem + +#endif + public :: elpa_solve_evp_complex_1stage_double_impl !< Driver routine for complex 1-stage eigenvalue problem +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: elpa_solve_evp_complex_1stage_single_impl !< Driver routine for complex 1-stage eigenvalue problem +#endif + + ! imported from elpa1_auxilliary + + public :: elpa_mult_at_b_real_double_impl !< Multiply double-precision real matrices A**T * B + + public :: elpa_mult_ah_b_complex_double_impl !< Multiply double-precision complex matrices A**H * B + + public :: elpa_invert_trm_real_double_impl !< Invert double-precision real triangular matrix + + public :: elpa_invert_trm_complex_double_impl !< Invert double-precision complex triangular matrix + + public :: elpa_cholesky_real_double_impl !< Cholesky factorization of a double-precision real matrix + + public :: elpa_cholesky_complex_double_impl !< Cholesky factorization of a double-precision complex matrix + + public :: elpa_solve_tridi_double_impl !< Solve a double-precision tridiagonal eigensystem with divide and conquer method + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: elpa_mult_at_b_real_single_impl !< Multiply single-precision real matrices A**T * B + public :: elpa_invert_trm_real_single_impl !< Invert single-precision real triangular matrix + public :: elpa_cholesky_real_single_impl !< Cholesky factorization of a single-precision real matrix + public :: elpa_solve_tridi_single_impl !< Solve a single-precision tridiagonal eigensystem with divide and conquer method +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: elpa_mult_ah_b_complex_single_impl !< Multiply single-precision complex matrices A**H * B + 
public :: elpa_invert_trm_complex_single_impl !< Invert single-precision complex triangular matrix + public :: elpa_cholesky_complex_single_impl !< Cholesky factorization of a single-precision complex matrix +#endif + +contains + + +!> \brief elpa_solve_evp_real_1stage_double_impl: Fortran function to solve the real double-precision eigenvalue problem with 1-stage solver +!> +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%mpi_comm_parent MPI communicator for columns +!> \param - obj%gpu use GPU version (1 or 0) +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. 
+!> +!> +!> \result success +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +!> \brief elpa_solve_evp_real_1stage_single_impl: Fortran function to solve the real single-precision eigenvalue problem with 1-stage solver +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%mpi_comm_parent MPI communicator for columns +!> \param - obj%gpu use GPU version (1 or 0) +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. 
+!> +!> +!> \result success + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif /* WANT_SINGLE_PRECISION_REAL */ + +!> \brief elpa_solve_evp_complex_1stage_double_impl: Fortran function to solve the complex double-precision eigenvalue problem with 1-stage solver +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%mpi_comm_parent MPI communicator for columns +!> \param - obj%gpu use GPU version (1 or 0) +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. 
+!> +!> +!> \result success +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa1_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +!> \brief elpa_solve_evp_complex_1stage_single_impl: Fortran function to solve the complex single-precision eigenvalue problem with 1-stage solver +!> \details +!> \param obj elpa_t object contains: +!> \param - obj%na Order of matrix +!> \param - obj%nev number of eigenvalues/vectors to be computed +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> \param - obj%local_nrows Leading dimension of a +!> \param - obj%local_ncols local columns of matrix q +!> \param - obj%nblk blocksize of cyclic distribution, must be the same in both directions! +!> \param - obj%mpi_comm_rows MPI communicator for rows +!> \param - obj%mpi_comm_cols MPI communicator for columns +!> \param - obj%mpi_comm_parent MPI communicator for columns +!> \param - obj%gpu use GPU version (1 or 0) +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. 
+!> +!> +!> \result success + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "../general/precision_macros.h" +#include "elpa1_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +end module ELPA1_impl diff -Nru elpa-2016.05.001/src/elpa1/elpa1_merge_systems_real_template.F90 elpa-2019.11.001/src/elpa1/elpa1_merge_systems_real_template.F90 --- elpa-2016.05.001/src/elpa1/elpa1_merge_systems_real_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_merge_systems_real_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1250 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! 
GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#include "../general/sanity.F90" + + subroutine merge_systems_& + &PRECISION & + (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & + l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads) + use cuda_functions + use iso_c_binding + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + +#ifdef WITH_OPENMP + use omp_lib +#endif + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, npc_0, npc_n + integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na) + real(kind=REAL_DATATYPE), intent(inout) :: d(na), e +#ifdef USE_ASSUMED_SIZE + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*) +#else + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols) +#endif + logical, intent(in) :: useGPU, wantDebug + logical, intent(out) :: success + + ! TODO: play with max_strip. 
If it was larger, matrices being multiplied + ! might be larger as well! + integer(kind=ik), parameter :: max_strip=128 + + + real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, & + qtrans(2,2), dmax, zmax, d1new, d2new + real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), & + dbase(na), ddiff(na), ev_scale(na), tmp(na) + real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na) + real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:) +#ifdef WITH_OPENMP + real(kind=REAL_DATATYPE), allocatable :: z_p(:,:) +#endif + + integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, & + l_rqm, ns, info + integer(kind=BLAS_KIND) :: infoBLAS + integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, & + l_cols_qreorg, np, l_idx, nqcols1, nqcols2 + integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, & + np_cols + integer(kind=MPI_KIND) :: mpierr + integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI + integer(kind=ik) :: np_next, np_prev, np_rem + integer(kind=ik) :: idx(na), idx1(na), idx2(na) + integer(kind=BLAS_KIND) :: idxBLAS(NA) + integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na) + + integer(kind=ik) :: istat + character(200) :: errorMessage + integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m + + integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev + logical :: successCUDA + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_real + integer(kind=ik), intent(in) :: max_threads +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread + + allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"merge_systems: error when allocating z_p "//errorMessage + stop 1 + endif +#endif + + call obj%timer%start("merge_systems" // PRECISION_SUFFIX) + success = .true. 
+ call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + call obj%timer%stop("mpi_communication") + + ! If my processor column isn't in the requested set, do nothing + + if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + return + endif + ! Determine number of "next" and "prev" column for ring sends + + if (my_pcol == npc_0+npc_n-1) then + np_next = npc_0 + else + np_next = my_pcol + 1 + endif + + if (my_pcol == npc_0) then + np_prev = npc_0+npc_n-1 + else + np_prev = my_pcol - 1 + endif + call check_monotony_& + &PRECISION& + &(obj, nm,d,'Input1',wantDebug, success) + if (.not.(success)) then + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + return + endif + call check_monotony_& + &PRECISION& + &(obj,na-nm,d(nm+1),'Input2',wantDebug, success) + if (.not.(success)) then + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + return + endif + ! Get global number of processors and my processor number. + ! Please note that my_proc does not need to match any real processor number, + ! it is just used for load balancing some loops. + + n_procs = np_rows*npc_n + my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major + + + ! Local limits of the rows of Q + + l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q + l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm + l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q + + l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm + l_rows = l_rqe-l_rqs+1 ! 
Total number of local rows + + + ! My number of local columns + + l_cols = COUNT(p_col(1:na)==my_pcol) + + ! Get max number of local columns + + max_local_cols = 0 + do np = npc_0, npc_0+npc_n-1 + max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) + enddo + + ! Calculations start here + + beta = abs(e) + sig = sign(1.0_rk,e) + + ! Calculate rank-1 modifier z + + z(:) = 0 + + if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then + ! nm is local on my row + do i = 1, na + if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) + enddo + endif + + if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then + ! nm+1 is local on my row + do i = 1, na + if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) + enddo + endif + + call global_gather_& + &PRECISION& + &(obj, z, na) + ! Normalize z so that norm(z) = 1. Since z is the concatenation of + ! two normalized vectors, norm2(z) = sqrt(2). + z = z/sqrt(2.0_rk) + rho = 2.0_rk*beta + ! Calculate index for merging both systems by ascending eigenvalues + call obj%timer%start("blas") + call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, & + 1_BLAS_KIND, 1_BLAS_KIND, idxBLAS ) + idx(:) = int(idxBLAS(:),kind=ik) + call obj%timer%stop("blas") + +! Calculate the allowable deflation tolerance + + zmax = maxval(abs(z)) + dmax = maxval(abs(d)) + EPS = PRECISION_LAMCH( 'E' ) ! return epsilon + TOL = 8.0_rk*EPS*MAX(dmax,zmax) + + ! If the rank-1 modifier is small enough, no more needs to be done + ! except to reorganize D and Q + + IF ( RHO*zmax <= TOL ) THEN + + ! Rearrange eigenvalues + + tmp = d + do i=1,na + d(i) = tmp(idx(i)) + enddo + + ! Rearrange eigenvectors + call resort_ev_& + &PRECISION & + (obj, idx, na) + + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + + return + ENDIF + + ! Merge and deflate system + + na1 = 0 + na2 = 0 + + ! COLTYP: + ! 1 : non-zero in the upper half only; + ! 2 : dense; + ! 3 : non-zero in the lower half only; + ! 4 : deflated. 
+ + coltyp(1:nm) = 1 + coltyp(nm+1:na) = 3 + + do i=1,na + + if (rho*abs(z(idx(i))) <= tol) then + + ! Deflate due to small z component. + + na2 = na2+1 + d2(na2) = d(idx(i)) + idx2(na2) = idx(i) + coltyp(idx(i)) = 4 + + else if (na1>0) then + + ! Check if eigenvalues are close enough to allow deflation. + + S = Z(idx(i)) + C = Z1(na1) + + ! Find sqrt(a**2+b**2) without overflow or + ! destructive underflow. + TAU = PRECISION_LAPY2( C, S ) + T = D1(na1) - D(idx(i)) + C = C / TAU + S = -S / TAU + IF ( ABS( T*C*S ) <= TOL ) THEN + + ! Deflation is possible. + + na2 = na2+1 + + Z1(na1) = TAU + + d2new = D(idx(i))*C**2 + D1(na1)*S**2 + d1new = D(idx(i))*S**2 + D1(na1)*C**2 + + ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 + ! This means that after the above transformation it must be + ! D1(na1) <= d1new <= D(idx(i)) + ! D1(na1) <= d2new <= D(idx(i)) + ! + ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) + ! so there is no problem with sorting here. + ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) + ! which makes a check (and possibly a resort) necessary. + ! + ! The above relations may not hold exactly due to numeric differences + ! so they have to be enforced in order not to get troubles with sorting. + + + if (d1newD(idx(i))) d1new = D(idx(i)) + + if (d2newD(idx(i))) d2new = D(idx(i)) + + D1(na1) = d1new + + do j=na2-1,1,-1 + if (d2new2) then + + ! Solve secular equation + + z(1:na1) = 1 +#ifdef WITH_OPENMP + z_p(1:na1,:) = 1 +#endif + dbase(1:na1) = 0 + ddiff(1:na1) = 0 + + info = 0 + infoBLAS = int(info,kind=BLAS_KIND) +!#ifdef WITH_OPENMP +! +! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) +!!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j) +! my_thread = omp_get_thread_num() +!!$OMP DO +!#endif + DO i = my_proc+1, na1, n_procs ! 
work distributed over all processors + call obj%timer%start("blas") + call PRECISION_LAED4(int(na1,kind=BLAS_KIND), int(i,kind=BLAS_KIND), d1, z1, delta, & + rho, s, infoBLAS) ! s is not used! + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + if (info/=0) then + ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) + ! use the more stable bisection algorithm in solve_secular_equation + ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' + call solve_secular_equation_& + &PRECISION& + &(obj, na1, i, d1, z1, delta, rho, s) + endif + + ! Compute updated z + +!#ifdef WITH_OPENMP +! do j=1,na1 +! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) +! enddo +! z_p(i,my_thread) = z_p(i,my_thread)*delta(i) +!#else + do j=1,na1 + if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) + enddo + z(i) = z(i)*delta(i) +!#endif + ! store dbase/ddiff + + if (i1) then + + if (np_rem==npc_0) then + np_rem = npc_0+npc_n-1 + else + np_rem = np_rem-1 + endif +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=MPI_KIND), MPI_REAL_PRECISION, & + int(np_next,kind=MPI_KIND), 1111_MPI_KIND, int(np_prev,kind=MPI_KIND), & + 1111_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + + if (useGPU) then + successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), & + gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA) + endif + + ! Gather the parts in d1 and z which are fitting to qtmp1. + ! This also delivers nnzu/nnzl for proc np_rem + + nnzu = 0 + nnzl = 0 + do i=1,na1 + if (p_col(idx1(i))==np_rem) then + if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then + nnzu = nnzu+1 + d1u(nnzu) = d1(i) + zu (nnzu) = z (i) + endif + if (coltyp(idx1(i))==3 .or. 
coltyp(idx1(i))==2) then + nnzl = nnzl+1 + d1l(nnzl) = d1(i) + zl (nnzl) = z (i) + endif + endif + enddo + + ! Set the deflated eigenvectors in Q (comming from proc np_rem) + + ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix + do i = 1, na + j = idx(i) + if (j>na1) then + if (p_col(idx2(j-na1))==np_rem) then + ndef = ndef+1 + if (p_col_out(i)==my_pcol) & + q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) + endif + endif + enddo + + do ns = 0, nqcols1-1, max_strip ! strimining loop + + ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip + + ! Get partial result from (output) Q + + do i = 1, ncnt + qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) + enddo + + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with upper half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! + tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) + call v_add_s_& + &PRECISION& + &(obj,tmp,nnzu,ddiff(j)) + ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) + enddo + + if(useGPU) then + !TODO: it should be enough to copy l_rows x ncnt + successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), & + gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) + + !TODO the previous loop could be possible to do on device and thus + !copy less + successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & + gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: ev_dev", successCUDA) + endif + + ! Multiply old Q with eigenvectors (upper half) + + if (l_rnm>0 .and. ncnt>0 .and. 
nnzu>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, & + 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), & + ev_dev, ubound(ev,dim=1), & + 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1)) + call obj%timer%stop("cublas") + else + call obj%timer%start("blas") + call obj%timer%start("gemm") + call PRECISION_GEMM('N', 'N', int(l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & + int(nnzu,kind=BLAS_KIND), & + 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & + ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & + 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) + call obj%timer%stop("gemm") + call obj%timer%stop("blas") + endif ! useGPU + endif + + if(useGPU) then + !TODO: it should be enough to copy l_rows x ncnt + !TODO: actually this will be done after the second mutiplication + + !TODO or actually maybe I should copy the half of the qtmp2 array + !here and the rest after the next gemm + !TODO either copy only half of the matrix here, and half after the + !second gemm, or copy whole array after the next gemm + +! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, & +! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost) +! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) + endif + + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with lower half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! 
+ tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) + call v_add_s_& + &PRECISION& + &(obj,tmp,nnzl,ddiff(j)) + ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) + enddo + + if(useGPU) then + !TODO the previous loop could be possible to do on device and thus + !copy less + successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & + gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: ev_dev", successCUDA) + endif + + ! Multiply old Q with eigenvectors (lower half) + + if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, & + 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), & + ev_dev, ubound(ev,dim=1), & + 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1)) + call obj%timer%stop("cublas") + else + call obj%timer%start("blas") + call obj%timer%start("gemm") + call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & + int(nnzl,kind=BLAS_KIND), & + 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & + ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & + 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) + call obj%timer%stop("gemm") + call obj%timer%stop("blas") + endif ! useGPU + endif + + if(useGPU) then + !TODO either copy only half of the matrix here, and get rid of the + !previous copy or copy whole array here + successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, & + gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) + endif + + ! Put partial result into (output) Q + + do i = 1, ncnt + q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) + enddo + + enddo !ns = 0, nqcols1-1, max_strip ! 
strimining loop + enddo !do np = 1, npc_n + + deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"merge_systems: error when deallocating ev "//errorMessage + stop 1 + endif + + if(useGPU) then + successCUDA = cuda_free(qtmp1_dev) + check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA) + successCUDA = cuda_free(qtmp2_dev) + check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA) + successCUDA = cuda_free(ev_dev) + check_dealloc_cuda("merge_systems: ev_dev", successCUDA) + endif + + endif !very outer test (na1==1 .or. na1==2) +#ifdef WITH_OPENMP + deallocate(z_p, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"merge_systems: error when deallocating z_p "//errorMessage + stop 1 + endif +#endif + + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + + return + + contains + subroutine add_tmp_& + &PRECISION& + &(obj, d1, dbase, ddiff, z, ev_scale_value, na1,i) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na1, i + + real(kind=REAL_DATATYPE), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) + real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value + real(kind=REAL_DATATYPE) :: tmp(1:na1) + + ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code + ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results + + ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) + ! 
in exactly this order, but we want to prevent compiler optimization + + tmp(1:na1) = d1(1:na1) -dbase(i) + call v_add_s_& + &PRECISION& + &(obj, tmp(1:na1),na1,ddiff(i)) + tmp(1:na1) = z(1:na1) / tmp(1:na1) + ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) + + end subroutine add_tmp_& + &PRECISION + + subroutine resort_ev_& + &PRECISION& + &(obj, idx_ev, nLength) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nLength + integer(kind=ik) :: idx_ev(nLength) + integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out + + real(kind=REAL_DATATYPE), allocatable :: qtmp(:,:) + integer(kind=ik) :: istat + character(200) :: errorMessage + + if (l_rows==0) return ! My processor column has no work to do + + ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) + + l_cols_out = COUNT(p_col_out(1:na)==my_pcol) + allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"resort_ev: error when allocating qtmp "//errorMessage + stop 1 + endif + + nc = 0 + + do i=1,na + + pc1 = p_col(idx_ev(i)) + lc1 = l_col(idx_ev(i)) + pc2 = p_col_out(i) + + if (pc2<0) cycle ! This column is not needed in output + + if (pc2==my_pcol) nc = nc+1 ! Counter for output columns + + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! 
send and recieve column are local + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) + else +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_send(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096),kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + else if (pc2==my_pcol) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_recv(qtmp(1,nc), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096),kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) +#endif /* WITH_MPI */ + endif + enddo + + ! Insert qtmp into (output) q + + nc = 0 + + do i=1,na + + pc2 = p_col_out(i) + lc2 = l_col_out(i) + + if (pc2==my_pcol) then + nc = nc+1 + q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) + endif + enddo + + deallocate(qtmp, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"resort_ev: error when deallocating qtmp "//errorMessage + stop 1 + endif + end subroutine resort_ev_& + &PRECISION + + subroutine transform_columns_& + &PRECISION& + &(obj, col1, col2) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik) :: col1, col2 + integer(kind=ik) :: pc1, pc2, lc1, lc2 + + if (l_rows==0) return ! My processor column has no work to do + + pc1 = p_col(col1) + lc1 = l_col(col1) + pc2 = p_col(col2) + lc2 = l_col(col2) + + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! 
both columns are local + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) + q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) + else +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_sendrecv(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & + tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) +#endif /* WITH_MPI */ + q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) + endif + else if (pc2==my_pcol) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_sendrecv(q(l_rqs,lc2), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & + tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) +#endif /* WITH_MPI */ + + q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + endif + end subroutine transform_columns_& + &PRECISION + + subroutine global_gather_& + &PRECISION& + &(obj, z, n) + ! This routine sums up z over all processors. + ! It should only be used for gathering distributed results, + ! i.e. z(i) should be nonzero on exactly 1 processor column, + ! otherways the results may be numerically different on different columns + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: z(n) + real(kind=REAL_DATATYPE) :: tmp(n) + + if (npc_n==1 .and. np_rows==1) return ! nothing to do + + ! 
Do an mpi_allreduce over processor rows +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp = z +#endif /* WITH_MPI */ + ! If only 1 processor column, we are done + if (npc_n==1) then + z(:) = tmp(:) + return + endif + + ! If all processor columns are involved, we can use mpi_allreduce + if (npc_n==np_cols) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp = z +#endif /* WITH_MPI */ + + return + endif + + ! Do a ring send over processor columns + z(:) = 0 + do np = 1, npc_n + z(:) = z(:) + tmp(:) +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Sendrecv_replace(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next,kind=MPI_KIND), 1111_MPI_KIND, & + int(np_prev,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + enddo + end subroutine global_gather_& + &PRECISION + + subroutine global_product_& + &PRECISION& + &(obj, z, n) + ! This routine calculates the global product of z. + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + + + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: z(n) + + real(kind=REAL_DATATYPE) :: tmp(n) + + if (npc_n==1 .and. np_rows==1) return ! nothing to do + + ! 
Do an mpi_allreduce over processor rows +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp = z +#endif /* WITH_MPI */ + ! If only 1 processor column, we are done + if (npc_n==1) then + z(:) = tmp(:) + return + endif + + ! If all processor columns are involved, we can use mpi_allreduce + if (npc_n==np_cols) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + z = tmp +#endif /* WITH_MPI */ + return + endif + + ! We send all vectors to the first proc, do the product there + ! and redistribute the result. + + if (my_pcol == npc_0) then + z(1:n) = tmp(1:n) + do np = npc_0+1, npc_0+npc_n-1 +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_recv(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + tmp(1:n) = z(1:n) +#endif /* WITH_MPI */ + z(1:n) = z(1:n)*tmp(1:n) + enddo + do np = npc_0+1, npc_0+npc_n-1 +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_send(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + enddo + else +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_send(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call mpi_recv(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 
1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + z(1:n) = tmp(1:n) +#endif /* WITH_MPI */ + + endif + end subroutine global_product_& + &PRECISION + + subroutine check_monotony_& + &PRECISION& + &(obj, n,d,text, wantDebug, success) + ! This is a test routine for checking if the eigenvalues are monotonically increasing. + ! It is for debug purposes only, an error should never be triggered! + use precision + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: d(n) + character*(*) :: text + + integer(kind=ik) :: i + logical, intent(in) :: wantDebug + logical, intent(out) :: success + + success = .true. + do i=1,n-1 + if (d(i+1) +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". 
+#endif + +#include "../general/sanity.F90" + +subroutine solve_tridi_& +&PRECISION_AND_SUFFIX & + ( obj, na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, useGPU, wantDebug, success, max_threads ) + + use precision + use elpa_abstract_impl + implicit none +#include "../../src/general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + real(kind=REAL_DATATYPE), intent(inout) :: d(na), e(na) +#ifdef USE_ASSUMED_SIZE + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*) +#else + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols) +#endif + logical, intent(in) :: useGPU, wantDebug + logical, intent(out) :: success + + integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:) + + integer(kind=ik) :: istat + character(200) :: errorMessage + character(20) :: gpuString + integer(kind=ik), intent(in) :: max_threads + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString) + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + call obj%timer%stop("mpi_communication") + + success = .true. + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! 
Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q + + ! Set Q to 0 + q(1:l_rows, 1:l_cols) = 0.0_rk + + ! Get the limits of the subdivisons, each subdivison has as many cols + ! as fit on the respective processor column + + allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating limits "//errorMessage + stop 1 + endif + + limits(0) = 0 + do np=0,np_cols-1 + nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np + + ! Check for the case that a column has have zero width. + ! This is not supported! + ! Scalapack supports it but delivers no results for these columns, + ! which is rather annoying + if (nc==0) then + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX) + if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' + success = .false. + return + endif + limits(np+1) = limits(np) + nc + enddo + + ! Subdivide matrix by subtracting rank 1 modifications + + do i=1,np_cols-1 + n = limits(i) + d(n) = d(n)-abs(e(n)) + d(n+1) = d(n+1)-abs(e(n)) + enddo + + ! Solve sub problems on processsor columns + + nc = limits(my_pcol) ! column after which my problem starts + + if (np_cols>1) then + nev1 = l_cols ! all eigenvectors are needed + else + nev1 = MIN(nev,l_cols) + endif + call solve_tridi_col_& + &PRECISION_AND_SUFFIX & + (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & + matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads) + if (.not.(success)) then + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + ! If there is only 1 processor column, we are done + + if (np_cols==1) then + deallocate(limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"solve_tridi: error when deallocating limits "//errorMessage + stop 1 + endif + + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + + ! Set index arrays for Q columns + + ! Dense distribution scheme: + + allocate(l_col(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating l_col "//errorMessage + stop 1 + endif + + allocate(p_col(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating p_col "//errorMessage + stop 1 + endif + + n = 0 + do np=0,np_cols-1 + nc = local_index(na, np, np_cols, nblk, -1) + do i=1,nc + n = n+1 + l_col(n) = i + p_col(n) = np + enddo + enddo + + ! Block cyclic distribution scheme, only nev columns are set: + + allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating l_col_bc "//errorMessage + stop 1 + endif + + allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating p_col_bc "//errorMessage + stop 1 + endif + + p_col_bc(:) = -1 + l_col_bc(:) = -1 + + do i = 0, na-1, nblk*np_cols + do j = 0, np_cols-1 + do n = 1, nblk + if (i+j*nblk+n <= MIN(nev,na)) then + p_col_bc(i+j*nblk+n) = j + l_col_bc(i+j*nblk+n) = i/np_cols + n + endif + enddo + enddo + enddo + + ! Recursively merge sub problems + call merge_recursive_& + &PRECISION & + (obj, 0, np_cols, useGPU, wantDebug, success) + if (.not.(success)) then + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + + deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"solve_tridi: error when deallocating l_col "//errorMessage + stop 1 + endif + + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + + contains + recursive subroutine merge_recursive_& + &PRECISION & + (obj, np_off, nprocs, useGPU, wantDebug, success) + use precision + use elpa_abstract_impl + implicit none + + ! noff is always a multiple of nblk_ev + ! nlen-noff is always > nblk_ev + + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: np_off, nprocs + integer(kind=ik) :: np1, np2, noff, nlen, nmid, n +#ifdef WITH_MPI +! integer(kind=ik) :: my_mpi_status(mpi_status_size) +#endif + logical, intent(in) :: useGPU, wantDebug + logical, intent(out) :: success + + success = .true. + + if (nprocs<=1) then + ! Safety check only + if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs + success = .false. + return + endif + ! Split problem into 2 subproblems of size np1 / np2 + + np1 = nprocs/2 + np2 = nprocs-np1 + + if (np1 > 1) call merge_recursive_& + &PRECISION & + (obj, np_off, np1, useGPU, wantDebug, success) + if (.not.(success)) return + if (np2 > 1) call merge_recursive_& + &PRECISION & + (obj, np_off+np1, np2, useGPU, wantDebug, success) + if (.not.(success)) return + + noff = limits(np_off) + nmid = limits(np_off+np1) - noff + nlen = limits(np_off+nprocs) - noff + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + if (my_pcol==np_off) then + do n=np_off+np1,np_off+nprocs-1 + call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + enddo + endif + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) + n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries + ndiv = ndiv*2 + enddo + + ! 
If there is only 1 processor row and not all eigenvectors are needed + ! and the matrix size is big enough, then use 2 subdivisions + ! so that merge_systems is called once and only the needed + ! eigenvectors are calculated for the final problem. + + if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 + + allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating limits "//errorMessage + stop 1 + endif + + limits(0) = 0 + limits(ndiv) = na + + n = ndiv + do while(n>1) + n = n/2 ! n is always a power of 2 + do i=0,ndiv-1,2*n + ! We want to have even boundaries (for cache line alignments) + limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 + enddo + enddo + + ! Calculate the maximum size of a subproblem + + max_size = 0 + do i=1,ndiv + max_size = MAX(max_size,limits(i)-limits(i-1)) + enddo + + ! Subdivide matrix by subtracting rank 1 modifications + + do i=1,ndiv-1 + n = limits(i) + d(n) = d(n)-abs(e(n)) + d(n+1) = d(n+1)-abs(e(n)) + enddo + + if (np_rows==1) then + + ! For 1 processor row there may be 1 or 2 subdivisions + do n=0,ndiv-1 + noff = limits(n) ! Start of subproblem + nlen = limits(n+1)-noff ! Size of subproblem + + call solve_tridi_single_problem_& + &PRECISION_AND_SUFFIX & + (obj, nlen,d(noff+1),e(noff+1), & + q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) + + if (.not.(success)) return + enddo + + else + + ! Solve sub problems in parallel with solve_tridi_single + ! There is at maximum 1 subproblem per processor + + allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage + stop 1 + endif + + allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage + stop 1 + endif + + qmat1 = 0 ! 
Make sure that all elements are defined + + if (my_prow < ndiv) then + + noff = limits(my_prow) ! Start of subproblem + nlen = limits(my_prow+1)-noff ! Size of subproblem + call solve_tridi_single_problem_& + &PRECISION_AND_SUFFIX & + (obj, nlen,d(noff+1),e(noff+1),qmat1, & + ubound(qmat1,dim=1), wantDebug, success) + + if (.not.(success)) return + endif + + ! Fill eigenvectors in qmat1 into global matrix q + + do np = 0, ndiv-1 + + noff = limits(np) + nlen = limits(np+1)-noff +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + qmat2 = qmat1 + call MPI_Bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ +! qmat2 = qmat1 ! is this correct +#endif /* WITH_MPI */ + do i=1,nlen + +#ifdef WITH_MPI + call distribute_global_column_& + &PRECISION & + (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) +#else /* WITH_MPI */ + call distribute_global_column_& + &PRECISION & + (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) +#endif /* WITH_MPI */ + enddo + + enddo + + deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage + stop 1 + endif + + endif + + ! Allocate and set index arrays l_col and p_col + + allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating l_col "//errorMessage + stop 1 + endif + + do i=1,na + l_col(i) = i + p_col_i(i) = 0 + p_col_o(i) = 0 + enddo + + ! 
Merge subproblems + + n = 1 + do while(n 1e-14_rk8) then +#else + if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then +#endif + write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) + else + write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) + write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' + write(error_unit,'(a)') 'In this extent, this is completely harmless.' + write(error_unit,'(a)') 'Still, we keep this info message just in case.' + end if + allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_single: error when allocating qtmp "//errorMessage + stop 1 + endif + + dtmp = d(i+1) + qtmp(1:nlen) = q(1:nlen,i+1) + do j=i,1,-1 + if (dtmp +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". 
+#endif + +#include "../general/sanity.F90" + +function elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl (obj, a, ev, q) result(success) + use precision + use cuda_functions + use mod_check_for_gpu + use iso_c_binding + use elpa_abstract_impl + use elpa_mpi + use elpa1_compute + use elpa_omp + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + real(kind=REAL_DATATYPE), intent(out) :: ev(obj%na) + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,*) + MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*) +#else + MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,obj%local_ncols) +#ifdef HAVE_SKEWSYMMETRIC + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols) +#endif +#endif + +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND), allocatable :: tau(:) + real(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:) + real(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:) +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + real(kind=REAL_DATATYPE), allocatable :: q_real(:,:) + complex(kind=C_DATATYPE_KIND), allocatable :: tau(:) + complex(kind=C_DATATYPE_KIND), allocatable,target :: q_dummy(:,:) + complex(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:) +#endif /* COMPLEXCASE */ + + + integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, np_rows, np_cols + integer(kind=MPI_KIND) :: np_rowsMPI, np_colsMPI + + logical :: useGPU + integer(kind=c_int) :: skewsymmetric + logical :: isSkewsymmetric + logical :: success + + logical :: do_useGPU, do_useGPU_tridiag, & + do_useGPU_solve_tridi, do_useGPU_trans_ev + integer(kind=ik) :: numberOfGPUDevices + + integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol + integer(kind=MPI_KIND) :: mpierr, my_peMPI, n_pesMPI, 
my_prowMPI, my_pcolMPI + real(kind=C_DATATYPE_KIND), allocatable :: e(:) + logical :: wantDebug + integer(kind=c_int) :: istat, debug, gpu + character(200) :: errorMessage + integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, & + mpi_comm_all, check_pd, i, error + + logical :: do_tridiag, do_solve, do_trans_ev + integer(kind=ik) :: nrThreads + integer(kind=ik) :: global_index + + call obj%timer%start("elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &") + +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + omp_threads_caller = omp_get_max_threads() + + ! check the number of threads that ELPA should use internally + call obj%get("omp_threads",nrThreads,error) + call omp_set_num_threads(nrThreads) +#else + nrThreads = 1 +#endif +#ifdef WITH_NVTX + call nvtxRangePush("elpa1") +#endif + + + success = .true. + + if (present(q)) then + obj%eigenvalues_only = .false. + else + obj%eigenvalues_only = .true. + endif + + na = obj%na + nev = obj%nev + lda = obj%local_nrows + ldq = obj%local_nrows + nblk = obj%nblk + matrixCols = obj%local_ncols + + ! special case na = 1 + if (na .eq. 1) then +#if REALCASE == 1 + ev(1) = a(1,1) +#endif +#if COMPLEXCASE == 1 + ev(1) = real(a(1,1)) +#endif + if (.not.(obj%eigenvalues_only)) then + q(1,1) = ONE + endif + + ! restore original OpenMP settings +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + call omp_set_num_threads(omp_threads_caller) +#endif + call obj%timer%stop("elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &") + return + endif + + if (nev == 0) then + nev = 1 + obj%eigenvalues_only = .true. + endif + + + call obj%get("mpi_comm_rows",mpi_comm_rows,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." 
+ stop + endif + call obj%get("mpi_comm_cols",mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + call obj%get("gpu",gpu,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + if (gpu .eq. 1) then + useGPU =.true. + else + useGPU = .false. + endif + + call obj%get("is_skewsymmetric",skewsymmetric,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + isSkewsymmetric = (skewsymmetric == 1) + + call obj%timer%start("mpi_communication") + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr) + + np_rows = int(np_rowsMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + call obj%timer%stop("mpi_communication") + + call obj%get("debug", debug,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + wantDebug = debug == 1 + do_useGPU = .false. + + + if (useGPU) then + call obj%timer%start("check_for_gpu") + call obj%get("mpi_comm_parent", mpi_comm_all,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr) + my_pe = int(my_peMPI,kind=c_int) + + if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then + do_useGPU = .true. + ! 
set the neccessary parameters + cudaMemcpyHostToDevice = cuda_memcpyHostToDevice() + cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost() + cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice() + cudaHostRegisterPortable = cuda_hostRegisterPortable() + cudaHostRegisterMapped = cuda_hostRegisterMapped() + else + print *,"GPUs are requested but not detected! Aborting..." + success = .false. + return + endif + call obj%timer%stop("check_for_gpu") + endif + + + do_useGPU_tridiag = do_useGPU + do_useGPU_solve_tridi = do_useGPU + do_useGPU_trans_ev = do_useGPU + ! only if we want (and can) use GPU in general, look what are the + ! requirements for individual routines. Implicitly they are all set to 1, so + ! unles specified otherwise by the user, GPU versions of all individual + ! routines should be used + if(do_useGPU) then + call obj%get("gpu_tridiag", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_tridiag = (gpu == 1) + + call obj%get("gpu_solve_tridi", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_solve_tridi = (gpu == 1) + + call obj%get("gpu_trans_ev", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_trans_ev = (gpu == 1) + endif + ! for elpa1 the easy thing is, that the individual phases of the algorithm + ! do not share any data on the GPU. + + + ! allocate a dummy q_intern, if eigenvectors should not be commputed and thus q is NOT present + if (.not.(obj%eigenvalues_only)) then + q_actual => q(1:obj%local_nrows,1:obj%local_ncols) + else + allocate(q_dummy(obj%local_nrows,obj%local_ncols)) + q_actual => q_dummy + endif + +#if COMPLEXCASE == 1 + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! 
Local columns of q + + l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + + allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when allocating q_real "//errorMessage + stop 1 + endif +#endif + allocate(e(na), tau(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when allocating e, tau "//errorMessage + stop 1 + endif + + + ! start the computations + ! as default do all three steps (this might change at some point) + do_tridiag = .true. + do_solve = .true. + do_trans_ev = .true. + + if (do_tridiag) then + call obj%timer%start("forward") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("tridi") +#endif +#ifdef WITH_NVTX + call nvtxRangePush("tridi") +#endif + + call tridiag_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads) + +#ifdef WITH_NVTX + call nvtxRangePop() +#endif +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("tridi") +#endif + call obj%timer%stop("forward") + endif !do_tridiag + + if (do_solve) then + call obj%timer%start("solve") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("solve") +#endif +#ifdef WITH_NVTX + call nvtxRangePush("solve") +#endif + + call solve_tridi_& + &PRECISION& + & (obj, na, nev, ev, e, & +#if REALCASE == 1 + q_actual, ldq, & +#endif +#if COMPLEXCASE == 1 + q_real, l_rows, & +#endif + nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads) + +#ifdef WITH_NVTX + call nvtxRangePop() +#endif +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("solve") +#endif + call obj%timer%stop("solve") + if (.not.(success)) return + endif !do_solve + + if (obj%eigenvalues_only) then + do_trans_ev = .false. 
+ else + call obj%get("check_pd",check_pd,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if (check_pd .eq. 1) then + check_pd = 0 + do i = 1, na + if (ev(i) .gt. THRESHOLD) then + check_pd = check_pd + 1 + endif + enddo + if (check_pd .lt. na) then + ! not positiv definite => eigenvectors needed + do_trans_ev = .true. + else + do_trans_ev = .false. + endif + endif ! check_pd + endif ! eigenvalues_only + + if (do_trans_ev) then + ! q must be given thats why from here on we can use q and not q_actual +#if COMPLEXCASE == 1 + q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) +#endif + if (isSkewsymmetric) then + ! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D. + ! This makes the eigenvectors complex. + ! For now real part of eigenvectors is generated in first half of q, imaginary part in second part. + q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0 + do i = 1, obj%local_nrows +! global_index = indxl2g(i, nblk, my_prow, 0, np_rows) + global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1 + if (mod(global_index-1,4) .eq. 0) then + ! do nothing + end if + if (mod(global_index-1,4) .eq. 1) then + q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols) + q(i,1:obj%local_ncols) = 0 + end if + if (mod(global_index-1,4) .eq. 2) then + q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols) + end if + if (mod(global_index-1,4) .eq. 3) then + q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols) + q(i,1:obj%local_ncols) = 0 + end if + end do + endif + + call obj%timer%start("back") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("trans_ev") +#endif +#ifdef WITH_NVTX + call nvtxRangePush("trans_ev") +#endif + + ! 
In the skew-symmetric case this transforms the real part + call trans_ev_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev) + if (isSkewsymmetric) then + ! Transform imaginary part + ! Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix. + call trans_ev_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, na, nev, a, lda, tau, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev) + endif + +#ifdef WITH_NVTX + call nvtxRangePop() +#endif +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("trans_ev") +#endif + call obj%timer%stop("back") + endif ! do_trans_ev + +#if COMPLEXCASE == 1 + deallocate(q_real, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when deallocating q_real "//errorMessage + stop 1 + endif +#endif + + deallocate(e, tau, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when deallocating e, tau "//errorMessage + stop 1 + endif + + if (obj%eigenvalues_only) then + deallocate(q_dummy, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when deallocating q_dummy "//errorMessage + stop 1 + endif + endif + +#ifdef WITH_NVTX + call nvtxRangePop() +#endif + ! restore original OpenMP settings +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! 
restore this at the end of ELPA 2 + call omp_set_num_threads(omp_threads_caller) +#endif + + call obj%timer%stop("elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &") +end function + + diff -Nru elpa-2016.05.001/src/elpa1/elpa1_tools_template.F90 elpa-2019.11.001/src/elpa1/elpa1_tools_template.F90 --- elpa-2016.05.001/src/elpa1/elpa1_tools_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_tools_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,361 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. 
If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#include "../general/sanity.F90" + +#if REALCASE == 1 + + subroutine v_add_s_& + &PRECISION& + &(obj, v,n,s) + use precision + use elpa_abstract_impl + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n + real(kind=rk) :: v(n),s + + v(:) = v(:) + s + end subroutine v_add_s_& + &PRECISION + + subroutine distribute_global_column_& + &PRECISION& + &(obj, g_col, l_col, noff, nlen, my_prow, np_rows, nblk) + use precision + use elpa_abstract_impl + implicit none +#include "../general/precision_kinds.F90" + + class(elpa_abstract_impl_t), intent(inout) :: obj + real(kind=rk) :: g_col(nlen), l_col(*) ! chnage this to proper 2d 1d matching ! remove assumed size + integer(kind=ik) :: noff, nlen, my_prow, np_rows, nblk + + integer(kind=ik) :: nbs, nbe, jb, g_off, l_off, js, je + + nbs = noff/(nblk*np_rows) + nbe = (noff+nlen-1)/(nblk*np_rows) + + do jb = nbs, nbe + + g_off = jb*nblk*np_rows + nblk*my_prow + l_off = jb*nblk + + js = MAX(noff+1-g_off,1) + je = MIN(noff+nlen-g_off,nblk) + + if (je 0 and d(i+1) > d(i) + ! + ! but this routine will not terminate with error if these are not satisfied + ! 
(it will normally converge to a pole in this case). + ! + ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases + ! N=1 and N=2 which is not compatible with DLAED4. + ! Thus this routine shouldn't be used for these cases as a simple replacement + ! of DLAED4. + ! + ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): + ! + ! + ! N (input) INTEGER + ! The length of all arrays. + ! + ! I (input) INTEGER + ! The index of the eigenvalue to be computed. 1 <= I <= N. + ! + ! D (input) DOUBLE PRECISION array, dimension (N) + ! The original eigenvalues. It is assumed that they are in + ! order, D(I) < D(J) for I < J. + ! + ! Z (input) DOUBLE PRECISION array, dimension (N) + ! The components of the updating Vector. + ! + ! DELTA (output) DOUBLE PRECISION array, dimension (N) + ! DELTA contains (D(j) - lambda_I) in its j-th component. + ! See remark above about DLAED4 compatibility! + ! + ! RHO (input) DOUBLE PRECISION + ! The scalar in the symmetric updating formula. + ! + ! DLAM (output) DOUBLE PRECISION + ! The computed lambda_I, the I-th updated eigenvalue. + !------------------------------------------------------------------------------- + + use precision + use elpa_abstract_impl + implicit none +#include "../../src/general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n, i + real(kind=rk) :: d(n), z(n), delta(n), rho, dlam + + integer(kind=ik) :: iter + real(kind=rk) :: a, b, x, y, dshift + + ! In order to obtain sufficient numerical accuracy we have to shift the problem + ! either by d(i) or d(i+1), whichever is closer to the solution + + ! Upper and lower bound of the shifted solution interval are a and b + + call obj%timer%start("solve_secular_equation" // PRECISION_SUFFIX) + if (i==n) then + + ! Special case: Last eigenvalue + ! We shift always by d(n), lower bound is d(n), + ! 
upper bound is determined by a guess: + + dshift = d(n) + delta(:) = d(:) - dshift + + a = 0.0_rk ! delta(n) + b = rho*SUM(z(:)**2) + 1.0_rk ! rho*SUM(z(:)**2) is the lower bound for the guess + else + + ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) + ! We check the sign of the function in the midpoint of the interval + ! in order to determine if eigenvalue is more close to d(i) or d(i+1) + x = 0.5_rk*(d(i)+d(i+1)) + y = 1.0_rk + rho*SUM(z(:)**2/(d(:)-x)) + if (y>0) then + ! solution is next to d(i) + dshift = d(i) + else + ! solution is next to d(i+1) + dshift = d(i+1) + endif + + delta(:) = d(:) - dshift + a = delta(i) + b = delta(i+1) + + endif + + ! Bisection: + + do iter=1,200 + + ! Interval subdivision + x = 0.5_rk*(a+b) + if (x==a .or. x==b) exit ! No further interval subdivisions possible +#ifdef DOUBLE_PRECISION_REAL + if (abs(x) < 1.e-200_rk8) exit ! x next to pole +#else + if (abs(x) < 1.e-20_rk4) exit ! x next to pole +#endif + ! evaluate value at x + + y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) + + if (y==0) then + ! found exact solution + exit + elseif (y>0) then + b = x + else + a = x + endif + + enddo + + ! Solution: + + dlam = x + dshift + delta(:) = delta(:) - x + call obj%timer%stop("solve_secular_equation" // PRECISION_SUFFIX) + + end subroutine solve_secular_equation_& + &PRECISION + !------------------------------------------------------------------------------- +#endif + +#if REALCASE == 1 + subroutine hh_transform_real_& +#endif +#if COMPLEXCASE == 1 + subroutine hh_transform_complex_& +#endif + &PRECISION & + (obj, alpha, xnorm_sq, xf, tau, wantDebug) +#if REALCASE == 1 + ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) +#endif +#if COMPLEXCASE == 1 + ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) +#endif + ! and returns the factor xf by which x has to be scaled. + ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 + ! 
since this would be expensive for the parallel implementation. + use precision + use elpa_abstract_impl + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: wantDebug +#if REALCASE == 1 + real(kind=rk), intent(inout) :: alpha +#endif +#if COMPLEXCASE == 1 + complex(kind=ck), intent(inout) :: alpha +#endif + real(kind=rk), intent(in) :: xnorm_sq +#if REALCASE == 1 + real(kind=rk), intent(out) :: xf, tau +#endif +#if COMPLEXCASE == 1 + complex(kind=ck), intent(out) :: xf, tau + real(kind=rk) :: ALPHR, ALPHI +#endif + + real(kind=rk) :: BETA + + if (wantDebug) call obj%timer%start("hh_transform_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX ) + +#if COMPLEXCASE == 1 + ALPHR = real( ALPHA, kind=rk ) + ALPHI = PRECISION_IMAG( ALPHA ) +#endif + +#if REALCASE == 1 + if ( XNORM_SQ==0.0_rk ) then +#endif +#if COMPLEXCASE == 1 + if ( XNORM_SQ==0.0_rk .AND. ALPHI==0.0_rk ) then +#endif + +#if REALCASE == 1 + if ( ALPHA>=0.0_rk ) then +#endif +#if COMPLEXCASE == 1 + if ( ALPHR>=0.0_rk ) then +#endif + TAU = 0.0_rk + else + TAU = 2.0_rk + ALPHA = -ALPHA + endif + XF = 0.0_rk + + else + +#if REALCASE == 1 + BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) +#endif +#if COMPLEXCASE == 1 + BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) +#endif + ALPHA = ALPHA + BETA + IF ( BETA<0 ) THEN + BETA = -BETA + TAU = -ALPHA / BETA + ELSE +#if REALCASE == 1 + ALPHA = XNORM_SQ / ALPHA +#endif +#if COMPLEXCASE == 1 + ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk)) + ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk ) +#endif + +#if REALCASE == 1 + TAU = ALPHA / BETA + ALPHA = -ALPHA +#endif +#if COMPLEXCASE == 1 + TAU = PRECISION_CMPLX( ALPHR/BETA, -ALPHI/BETA ) + ALPHA = PRECISION_CMPLX( -ALPHR, ALPHI ) +#endif + END IF + XF = 1.0_rk/ALPHA + ALPHA = BETA + endif + + if (wantDebug) call obj%timer%stop("hh_transform_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX ) + +#if REALCASE == 1 + end 
subroutine hh_transform_real_& +#endif +#if COMPLEXCASE == 1 + end subroutine hh_transform_complex_& +#endif + &PRECISION diff -Nru elpa-2016.05.001/src/elpa1/elpa1_trans_ev_template.F90 elpa-2019.11.001/src/elpa1/elpa1_trans_ev_template.F90 --- elpa-2016.05.001/src/elpa1/elpa1_trans_ev_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_trans_ev_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,507 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#include "../general/sanity.F90" + +!> \brief Transforms the eigenvectors of a tridiagonal matrix back +!> to the eigenvectors of the original matrix +!> (like Scalapack Routine PDORMTR) +!> +! Parameters +! +!> \param na Order of matrix a_mat, number of rows of matrix q_mat +!> +!> \param nqc Number of columns of matrix q_mat +!> +!> \param a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) +!> Distribution is like in Scalapack. +!> +!> \param lda Leading dimension of a_mat +!> +!> \param tau(na) Factors of the Householder vectors +!> +!> \param q_mat On input: Eigenvectors of tridiagonal matrix +!> On output: Transformed eigenvectors +!> Distribution is like in Scalapack. +!> +!> \param ldq Leading dimension of q_mat +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> +!> \param matrixCols local columns of matrix a_mat and q_mat +!> +!> \param mpi_comm_rows MPI-Communicator for rows +!> +!> \param mpi_comm_cols MPI-Communicator for columns +!> +!> \param useGPU If true, GPU version of the subroutine will be used +!> + + subroutine trans_ev_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) + use cuda_functions + use iso_c_binding + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + MATH_DATATYPE(kind=rck), intent(in) :: tau(na) + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*) + MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*) +#else + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols) + MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols) +#endif + logical, intent(in) :: useGPU + integer(kind=ik) :: max_stored_rows, max_stored_rows_fac + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols + integer(kind=ik) :: l_cols, l_rows, l_colh, nstor + integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol + integer(kind=ik) :: hvn_ubnd, hvm_ubnd + + MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:) + MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:) + MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:) + MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:) + + integer(kind=ik) :: istat + character(200) :: errorMessage + character(20) :: gpuString + + integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev + logical :: successCUDA + 
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + integer(kind=ik) :: error + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("trans_ev_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) + + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + call obj%timer%stop("mpi_communication") + + call obj%get("max_stored_rows",max_stored_rows_fac, error) + + totalblocks = (na-1)/nblk + 1 + max_blocks_row = (totalblocks-1)/np_rows + 1 + max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat! 
+ + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk + + max_stored_rows = (max_stored_rows_fac/nblk+1)*nblk + + allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "tmat", istat, errorMessage) + + allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "h1", istat, errorMessage) + + allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "h2", istat, errorMessage) + + allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "tmp1", istat, errorMessage) + + allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "tmp2", istat, errorMessage) + + allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "hvn", istat, errorMessage) + + allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "hvm", istat, errorMessage) + + hvm = 0 ! Must be set to 0 !!! + hvb = 0 ! Safety only + + l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat + + nstor = 0 + if (useGPU) then + hvn_ubnd = 0 + endif + +#if COMPLEXCASE == 1 + ! In the complex case tau(2) /= 0 + if (my_prow == prow(1, nblk, np_rows)) then + q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2)) + endif +#endif + + if (useGPU) then + ! todo: this is used only for copying hmv to device.. 
it should be possible to go without it + allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "hvm1", istat, errorMessage) + + successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + +! q_dev = q_mat + successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), & + ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif ! useGPU + + do istep = 1, na, nblk + ics = MAX(istep,3) + ice = MIN(istep+nblk-1,na) + if (ice0) & + call MPI_Bcast(hvb, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION , int(cur_pcol,kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + nb = 0 + do ic = ics, ice + l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector + hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) + if (useGPU) then + hvm_ubnd = l_rows + endif + nstor = nstor+1 + nb = nb+l_rows + enddo + + ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! + if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then + + ! Calculate scalar products of stored vectors. + ! 
This can be done in different ways, we use dsyrk or zherk + + tmat = 0 + call obj%timer%start("blas") + if (l_rows>0) & +#if REALCASE == 1 + call PRECISION_SYRK('U', 'T', & +#endif +#if COMPLEXCASE == 1 + call PRECISION_HERK('U', 'C', & +#endif + int(nstor,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, & + hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), ZERO, tmat, int(max_stored_rows,kind=BLAS_KIND)) + call obj%timer%stop("blas") + nc = 0 + do n = 1, nstor-1 + h1(nc+1:nc+n) = tmat(1:n,n+1) + nc = nc+n + enddo +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + if (nc>0) call mpi_allreduce( h1, h2, int(nc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + if (nc > 0) h2 = h1 + +#endif /* WITH_MPI */ + ! Calculate triangular matrix T + + nc = 0 + tmat(1,1) = tau(ice-nstor+1) + do n = 1, nstor-1 + call obj%timer%start("blas") + call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', int(n,kind=BLAS_KIND), tmat, & + int(max_stored_rows,kind=BLAS_KIND), h2(nc+1), 1_BLAS_KIND) + call obj%timer%stop("blas") + + tmat(n+1,1:n) = & +#if REALCASE == 1 + -h2(nc+1:nc+n) & +#endif +#if COMPLEXCASE == 1 + -conjg(h2(nc+1:nc+n)) & +#endif + *tau(ice-nstor+n+1) + + tmat(n+1,n+1) = tau(ice-nstor+n+1) + nc = nc+n + enddo + + if (useGPU) then + ! todo: is this reshape really neccessary? + hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /)) + + !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor) + successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), & + hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice) + + check_memcpy_cuda("trans_ev", successCUDA) + + !tmat_dev = tmat + successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), & + max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif + + ! 
Q = Q - V * T * V**T * Q + + if (l_rows>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, & + q_dev, ldq, ZERO, tmp_dev, nstor) + call obj%timer%stop("cublas") + + else ! useGPU + + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(l_rows,kind=BLAS_KIND), ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, int(nstor,kind=BLAS_KIND)) + call obj%timer%stop("blas") + endif ! useGPU + + else !l_rows>0 + + if (useGPU) then + successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype) + check_memcpy_cuda("trans_ev", successCUDA) + else + tmp1(1:l_cols*nstor) = 0 + endif + endif !l_rows>0 + +#ifdef WITH_MPI + ! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI + ! todo: does it need to be copied whole? Wouldn't be a part sufficient? + if (useGPU) then + successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, & + max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("trans_ev", successCUDA) + endif + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + ! copy back tmp2 - after reduction... + if (useGPU) then + successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), & + max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif ! useGPU + + +#else /* WITH_MPI */ +! 
tmp2 = tmp1 +#endif /* WITH_MPI */ + + if (l_rows>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', & + nstor, l_cols, ONE, tmat_dev, max_stored_rows, & + tmp_dev, nstor) + + call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, & + -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, & + ONE, q_dev, ldq) + call obj%timer%stop("cublas") + else !useGPU +#ifdef WITH_MPI + ! tmp2 = tmat * tmp2 + call obj%timer%start("blas") + call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND)) + !q_mat = q_mat - hvm*tmp2 + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(nstor,kind=BLAS_KIND), & + -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND), & + ONE, q_mat, int(ldq,kind=BLAS_KIND)) + call obj%timer%stop("blas") +#else /* WITH_MPI */ + call obj%timer%start("blas") + + call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp1, int(nstor,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(nstor,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + tmp1, int(nstor,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) + call obj%timer%stop("blas") +#endif /* WITH_MPI */ + endif ! useGPU + endif ! l_rows>0 + nstor = 0 + endif ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) + + enddo ! istep=1,na,nblk + + deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_& + &MATH_DATATYPE& + &: error when deallocating hvm "//errorMessage + stop 1 + endif + + if (useGPU) then + !q_mat = q_dev + successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), & + q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("trans_ev", successCUDA) + + deallocate(hvm1, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_& + &MATH_DATATYPE& + &: error when deallocating hvm1 "//errorMessage + stop 1 + endif + + !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev) + successCUDA = cuda_free(q_dev) + check_dealloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_free(tmp_dev) + check_dealloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_free(hvm_dev) + check_dealloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_free(tmat_dev) + check_dealloc_cuda("trans_ev", successCUDA) + + endif + + call obj%timer%stop("trans_ev_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX // & + gpuString ) + + end subroutine trans_ev_& + &MATH_DATATYPE& + &_& + &PRECISION diff -Nru elpa-2016.05.001/src/elpa1/elpa1_tridiag_template.F90 elpa-2019.11.001/src/elpa1/elpa1_tridiag_template.F90 --- elpa-2016.05.001/src/elpa1/elpa1_tridiag_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa1_tridiag_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1058 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". 
+#endif + +#include "../general/sanity.F90" + +#undef SAVE_MATR +#ifdef DOUBLE_PRECISION_REAL +#define SAVE_MATR(name, iteration) \ +call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration) +#else +#define SAVE_MATR(name, iteration) +#endif + +!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD) +!> +! Parameters +! +!> \param obj object of elpa_type +!> \param na Order of matrix +!> +!> \param a_mat(lda,matrixCols) Distributed matrix which should be reduced. +!> Distribution is like in Scalapack. +!> Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) +!> a(:,:) is overwritten on exit with the Householder vectors +!> +!> \param lda Leading dimension of a +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> +!> \param matrixCols local columns of matrix +!> +!> \param mpi_comm_rows MPI-Communicator for rows +!> \param mpi_comm_cols MPI-Communicator for columns +!> +!> \param d_vec(na) Diagonal elements (returned), identical on all processors +!> +!> \param e_vec(na) Off-Diagonal elements (returned), identical on all processors +!> +!> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation +!> +!> \param useGPU If true, GPU version of the subroutine will be used +!> \param wantDebug if true more debug information +!> + subroutine tridiag_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug, max_threads) + use cuda_functions + use iso_c_binding + use precision + use elpa_abstract_impl + use matrix_plot + use elpa_omp + use elpa_blas_interfaces + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + logical, intent(in) :: useGPU, 
wantDebug + integer(kind=c_int) :: skewsymmetric + logical :: isSkewsymmetric + + MATH_DATATYPE(kind=rck), intent(out) :: tau(na) +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*) +#else + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols) +#endif + real(kind=rk), intent(out) :: d_vec(na) + real(kind=rk), intent(out) :: e_vec(na) + integer(kind=ik), parameter :: max_stored_uv = 32 + logical, parameter :: mat_vec_as_one_block = .true. + + ! id in processor row and column and total numbers of processor rows and columns + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, & + max_local_cols + ! updated after each istep (in the main cycle) to contain number of + ! local columns and rows of the remaining part of the matrix + !integer(kind=ik) :: l_cols, l_rows + integer(kind=ik) :: l_cols, l_rows + integer(kind=ik) :: n_stored_vecs + + integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, & + uv_stored_cols_dev + logical :: successCUDA + + integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end + integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile + integer(kind=c_intptr_t) :: a_offset + + integer(kind=ik), intent(in) :: max_threads +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread, n_threads, n_iter +#endif + + real(kind=rk) :: vnorm2 + MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf +#if COMPLEXCASE == 1 + complex(kind=rck) :: aux3(1) +#endif + + MATH_DATATYPE(kind=rck), allocatable :: tmp(:) + MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector + v_col(:), & ! the same Vector, but transposed + ! - differently distributed among MPI tasks + u_row(:), & + u_col(:) + ! 
the following two matrices store pairs of vectors v and u calculated in each step + ! at most max_stored_uv Vector pairs are stored, than the matrix A_i is explicitli updated + ! u and v are stored both in row and Vector forms + ! pattern: v1,u1,v2,u2,v3,u3,.... + ! todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa + MATH_DATATYPE(kind=rck), allocatable :: vu_stored_rows(:,:) + ! pattern: u1,v1,u2,v2,u3,v3,.... + MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:) + +#ifdef WITH_OPENMP + MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:) +#endif + + real(kind=rk), allocatable :: tmp_real(:) + integer(kind=ik) :: min_tile_size, error + integer(kind=ik) :: istat + character(200) :: errorMessage + character(20) :: gpuString + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + call obj%get("is_skewsymmetric",skewsymmetric,istat) + if (istat .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + isSkewsymmetric = (skewsymmetric == 1) + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("tridiag_& + &MATH_DATATYPE& + &" // & + PRECISION_SUFFIX // & + gpuString ) + + + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr) + + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + if (wantDebug) call obj%timer%stop("mpi_communication") + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + ! 
seems that tile is a square submatrix, consisting by several blocks + ! it is a smallest possible square submatrix, where blocks being distributed among + ! processors are "aligned" in both rows and columns + ! ----------------- + ! | 1 4 | 1 4 | 1 4 | ... + ! | 2 5 | 2 5 | 2 5 | ... + ! | 3 6 | 3 6 | 3 6 | ... + ! ----------------- ... + ! | 1 4 | 1 4 | 1 4 | ... + ! | 2 5 | 2 5 | 2 5 | ... + ! | 3 6 | 3 6 | 3 6 | ... + ! ----------------- . + ! : : : : : : . + ! : : : : : : . + ! + ! this is a tile, where each number represents block, assigned to a processor with the shown number + ! size of this small block is nblk + ! Image is for situation with 6 processors, 3 processor rows and 2 columns + ! tile_size is thus nblk * 6 + ! + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + + ! make tile_size a smallest possible multiple of previously defined tile size, such that it is + ! larger or equal to min_tile_size + ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value + ! it can, however, be set by the user + call obj%get("min_tile_size", min_tile_size ,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if(min_tile_size == 0) then + ! not set by the user, use the default value + min_tile_size = 128*max(np_rows, np_cols) + endif + tile_size = ((min_tile_size-1)/tile_size+1)*tile_size + + l_rows_per_tile = tile_size/np_rows ! local rows of a tile + l_cols_per_tile = tile_size/np_cols ! local cols of a tile + + totalblocks = (na-1)/nblk + 1 + max_loc_block_rows = (totalblocks-1)/np_rows + 1 + max_loc_block_cols = (totalblocks-1)/np_cols + 1 + + ! localy owned submatrix has size at most max_local_rows x max_local_cols at each processor + max_local_rows = max_loc_block_rows*nblk + max_local_cols = max_loc_block_cols*nblk + + ! allocate memmory for vectors + ! 
todo: It is little bit confusing, I think, that variables _row actually store columns and vice versa + ! todo: if something has length max_local_rows, it is actually a column, no? + ! todo: probably one should read it as v_row = Vector v distributed among rows + ! + allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "tmp", istat, errorMessage) + + ! allocate v_row 1 element longer to allow store and broadcast tau together with it + allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "v_row", istat, errorMessage) + + allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "u_row", istat, errorMessage) + + allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "v_col", istat, errorMessage) + + allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "u_col", istat, errorMessage) + +#ifdef WITH_OPENMP + allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "ur_p", istat, errorMessage) + + allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "uc_p", istat, errorMessage) +#endif /* WITH_OPENMP */ + + tmp = 0 + v_row = 0 + u_row = 0 + v_col = 0 + u_col = 0 + + allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage) + + allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage) + call check_alloc("tridiag_& + &MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage) + + if (useGPU) then + successCUDA = cuda_malloc(v_row_dev, max_local_rows * 
size_of_datatype) + check_alloc_cuda("tridiag: v_row_dev", successCUDA) + + successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype) + + check_alloc_cuda("tridiag: u_row_dev", successCUDA) + + successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype) + check_alloc_cuda("tridiag: v_col_dev", successCUDA) + + successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype) + check_alloc_cuda("tridiag: u_col_dev", successCUDA) + + successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype) + check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA) + + successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype) + check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA) + endif !useGPU + + + d_vec(:) = 0 + e_vec(:) = 0 + tau(:) = 0 + + n_stored_vecs = 0 + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat + + if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) & +#if COMPLEXCASE == 1 + d_vec(na) = real(a_mat(l_rows,l_cols), kind=rk) +#endif +#if REALCASE == 1 + d_vec(na) = a_mat(l_rows,l_cols) +#endif + + if (useGPU) then + ! allocate memmory for matrix A on the device and than copy the matrix + + successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype) + check_alloc_cuda("tridiag: a_dev", successCUDA) + + successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), & + lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("tridiag: a_dev", successCUDA) + endif + + ! main cycle of tridiagonalization + ! in each step, 1 Householder Vector is calculated + do istep = na, 3 ,-1 + + ! Calculate number of local rows and columns of the still remaining matrix + ! 
on the local processor + l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) + l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) + + ! Calculate Vector for Householder transformation on all procs + ! owning column istep + + if (my_pcol == pcol(istep, nblk, np_cols)) then + + ! Get Vector to be transformed; distribute last element and norm of + ! remaining elements to all procs in current column + + ! copy l_cols + 1 column of A to v_row + if (useGPU) then + a_offset = l_cols * lda * size_of_datatype + ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, + ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice) + + successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), & + a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("tridiag a_dev 1", successCUDA) + else + v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1) + endif + + if (n_stored_vecs > 0 .and. l_rows > 0) then + if (wantDebug) call obj%timer%start("blas") +#if COMPLEXCASE == 1 + aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs)) +#endif + call PRECISION_GEMV('N', & + int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), & + ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), & +#if REALCASE == 1 + uv_stored_cols(l_cols+1,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), & +#endif +#if COMPLEXCASE == 1 + aux, 1_BLAS_KIND, & + +#endif + ONE, v_row, 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + + endif + + if(my_prow == prow(istep-1, nblk, np_rows)) then + aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1)) + aux1(2) = v_row(l_rows) + else + aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows)) + aux1(2) = 0. 
+ endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + aux2 = aux1 +#endif /* WITH_MPI */ + +#if REALCASE == 1 + vnorm2 = aux2(1) +#endif +#if COMPLEXCASE == 1 + vnorm2 = real(aux2(1),kind=rk) +#endif + vrl = aux2(2) + + ! Householder transformation +#if REALCASE == 1 + call hh_transform_real_& +#endif +#if COMPLEXCASE == 1 + call hh_transform_complex_& +#endif + &PRECISION & + (obj, vrl, vnorm2, xf, tau(istep), wantDebug) + ! Scale v_row and store Householder Vector for back transformation + + v_row(1:l_rows) = v_row(1:l_rows) * xf + if (my_prow == prow(istep-1, nblk, np_rows)) then + v_row(l_rows) = 1. + + ! vrl is newly computed off-diagonal element of the final tridiagonal matrix +#if REALCASE == 1 + e_vec(istep-1) = vrl +#endif +#if COMPLEXCASE == 1 + e_vec(istep-1) = real(vrl,kind=rk) +#endif + endif + + ! store Householder Vector for back transformation + a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows) + + ! add tau after the end of actuall v_row, to be broadcasted with it + v_row(l_rows+1) = tau(istep) + endif !(my_pcol == pcol(istep, nblk, np_cols)) + +! SAVE_MATR("HH vec stored", na - istep + 1) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + ! Broadcast the Householder Vector (and tau) along columns + call MPI_Bcast(v_row, int(l_rows+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(pcol(istep, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + !recover tau, which has been broadcasted together with v_row + tau(istep) = v_row(l_rows+1) + + ! 
Transpose Householder Vector v_row -> v_col + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, & + 1, istep-1, 1, nblk, max_threads) + + ! Calculate u = (A + VU**T + UV**T)*v + + ! For cache efficiency, we use only the upper half of the matrix tiles for this, + ! thus the result is partly in u_col(:) and partly in u_row(:) + + u_col(1:l_cols) = 0 + u_row(1:l_rows) = 0 + if (l_rows > 0 .and. l_cols> 0 ) then + if(useGPU) then + successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype) + check_memcpy_cuda("tridiag: u_col_dev", successCUDA) + + successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype) + check_memcpy_cuda("tridiag: u_row_dev", successCUDA) + + successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), & + l_cols * size_of_datatype, cudaMemcpyHostToDevice) + + check_memcpy_cuda("tridiag: v_col_dev", successCUDA) + + successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), & + l_rows * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("tridiag: v_row_dev", successCUDA) + endif ! useGU + +#ifdef WITH_OPENMP + call obj%timer%start("OpenMP parallel") +!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) + + my_thread = omp_get_thread_num() + + n_threads = omp_get_num_threads() + + n_iter = 0 + + ! first calculate A*v part of (A + VU**T + UV**T)*v + uc_p(1:l_cols,my_thread) = 0. + ur_p(1:l_rows,my_thread) = 0. 
+#endif /* WITH_OPENMP */ + do i= 0, (istep-2)/tile_size + l_col_beg = i*l_cols_per_tile+1 + l_col_end = min(l_cols,(i+1)*l_cols_per_tile) + if (l_col_end < l_col_beg) cycle + do j = 0, i + l_row_beg = j*l_rows_per_tile+1 + l_row_end = min(l_rows,(j+1)*l_rows_per_tile) + if (l_row_end < l_row_beg) cycle +#ifdef WITH_OPENMP + if (mod(n_iter,n_threads) == my_thread) then + if (wantDebug) call obj%timer%start("blas") + call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, & + int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), & + v_row(l_row_beg), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND) + + if (i/=j) then + if (isSkewsymmetric) then + call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, & + ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND) + + else + call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, & + ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND) + endif + endif + if (wantDebug) call obj%timer%stop("blas") + endif + n_iter = n_iter+1 +#else /* WITH_OPENMP */ + + ! multiplication by blocks is efficient only for CPU + ! for GPU we introduced 2 other ways, either by stripes (more simmilar to the original + ! CPU implementation) or by one large matrix Vector multiply + if (.not. 
useGPU) then + if (wantDebug) call obj%timer%start("blas") + call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, & + int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND), & + v_row(l_row_beg), 1_BLAS_KIND, & + ONE, u_col(l_col_beg), 1_BLAS_KIND) + + if (i/=j) then + if (isSkewsymmetric) then + call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + -ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), & + v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND) + + else + call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), & + ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), & + v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND) + endif + endif + if (wantDebug) call obj%timer%stop("blas") + endif ! not useGPU + +#endif /* WITH_OPENMP */ + enddo ! j=0,i + enddo ! i=0,(istep-2)/tile_size + + if (useGPU) then + if(mat_vec_as_one_block) then + ! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication + ! this requires altering of the algorithm when later explicitly updating the matrix + ! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal + if (wantDebug) call obj%timer%start("cublas") + call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, & + ONE, a_dev, lda, & + v_row_dev , 1, & + ONE, u_col_dev, 1) + + ! todo: try with non transposed!!! +! if(i/=j) then +! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, & +! ONE, a_dev + a_offset, lda, & +! v_col_dev + (l_col_beg - 1) * & +! size_of_datatype, 1, & +! ONE, u_row_dev + (l_row_beg - 1) * & +! size_of_datatype, 1) +! 
endif + if (wantDebug) call obj%timer%stop("cublas") + + else + !perform multiplication by stripes - it is faster than by blocks, since we call cublas with + !larger matrices. In general, however, this algorithm is very simmilar to the one with CPU + do i=0,(istep-2)/tile_size + l_col_beg = i*l_cols_per_tile+1 + l_col_end = min(l_cols,(i+1)*l_cols_per_tile) + if(l_col_end 0) then + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + call PRECISION_GEMV('T', & +#endif +#if COMPLEXCASE == 1 + call PRECISION_GEMV('C', & +#endif + int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), & + ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), & + v_row, 1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND) + + call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), & + ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), & + aux, 1_BLAS_KIND, ONE, u_col, 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + endif + + endif ! (l_rows>0 .and. l_cols>0) + + ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts + ! on the processors containing the diagonal + ! This is only necessary if u_row has been calculated, i.e. if the + ! global tile size is smaller than the global remaining matrix + + if (tile_size < istep-1) then + + call elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), & + mpi_comm_cols, istep-1, 1, nblk, max_threads) + + endif + + ! 
Sum up all the u_col(:) parts, transpose u_col -> u_row + + if (l_cols>0) then + tmp(1:l_cols) = u_col(1:l_cols) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + u_col = tmp +#endif /* WITH_MPI */ + endif + if (isSkewsymmetric) then + call elpa_transpose_vectors_ss_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), & + mpi_comm_rows, 1, istep-1, 1, nblk, max_threads) + else + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), & + mpi_comm_rows, 1, istep-1, 1, nblk, max_threads) + endif + + ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) + x = 0 + if (l_cols>0) & + x = dot_product(v_col(1:l_cols),u_col(1:l_cols)) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + vav = x + +#endif /* WITH_MPI */ + + ! store u and v in the matrices U and V + ! 
these matrices are stored combined in one here + + do j=1,l_rows +#if REALCASE == 1 + vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j) + vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j) +#endif +#if COMPLEXCASE == 1 + vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j) + vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j) +#endif + enddo + do j=1,l_cols +#if REALCASE == 1 + uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j) + uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j) +#endif +#if COMPLEXCASE == 1 + uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j) + uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j) +#endif + enddo + + ! We have calculated another Hauseholder Vector, number of implicitly stored increased + n_stored_vecs = n_stored_vecs+1 + + ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T + if (n_stored_vecs == max_stored_uv .or. istep == 3) then + + if (useGPU) then + successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), & + max_local_rows * 2 * max_stored_uv * & + size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA) + + successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), & + max_local_cols * 2 * max_stored_uv * & + size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA) + endif + + do i = 0, (istep-2)/tile_size + ! 
go over tiles above (or on) the diagonal + l_col_beg = i*l_cols_per_tile+1 + l_col_end = min(l_cols,(i+1)*l_cols_per_tile) + l_row_beg = 1 + l_row_end = min(l_rows,(i+1)*l_rows_per_tile) + if (l_col_end 0) then + a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) & + + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs)) + end if +#if REALCASE == 1 + if (isSkewsymmetric) then + d_vec(istep-1) = 0.0_rk + else + d_vec(istep-1) = a_mat(l_rows,l_cols) + endif +#endif +#if COMPLEXCASE == 1 + d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk) +#endif + + if (useGPU) then + !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols) + !successCUDA = cuda_threadsynchronize() + !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA) + + successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), & + int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice) + check_memcpy_cuda("tridiag: a_dev 4", successCUDA) + endif + endif + + enddo ! main cycle over istep=na,3,-1 + +#if COMPLEXCASE == 1 + ! Store e_vec(1) and d_vec(1) + + if (my_pcol==pcol(2, nblk, np_cols)) then + if (my_prow==prow(1, nblk, np_rows)) then + ! We use last l_cols value of loop above + if(useGPU) then + successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, & + 1 * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("tridiag: a_dev 5", successCUDA) + vrl = aux3(1) + else !useGPU + vrl = a_mat(1,l_cols) + endif !useGPU + call hh_transform_complex_& + &PRECISION & + (obj, vrl, 0.0_rk, xf, tau(2), wantDebug) +#if REALCASE == 1 + e_vec(1) = vrl +#endif +#if COMPLEXCASE == 1 + e_vec(1) = real(vrl,kind=rk) +#endif + + + a_mat(1,l_cols) = 1. ! 
for consistency only + endif +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(prow(1, nblk, np_rows),kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(pcol(2, nblk, np_cols),kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then + if(useGPU) then + successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, & + 1 * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("tridiag: a_dev 6", successCUDA) + d_vec(1) = PRECISION_REAL(aux3(1)) + else !useGPU + d_vec(1) = PRECISION_REAL(a_mat(1,1)) + endif !useGPU + endif + +#endif /* COMPLEXCASE == 1 */ + +#if REALCASE == 1 + ! Store e_vec(1) + + if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then + if(useGPU) then + successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, & + 1 * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("tridiag: a_dev 7", successCUDA) + else !useGPU + e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above + endif !useGPU + endif + + ! Store d_vec(1) + if (my_prow==prow(1, nblk, np_rows) .and. 
my_pcol==pcol(1, nblk, np_cols)) then + if(useGPU) then + successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("tridiag: a_dev 8", successCUDA) + else !useGPU + if (isSkewsymmetric) then + d_vec(1) = 0.0_rk + else + d_vec(1) = a_mat(1,1) + endif + endif !useGPU + endif +#endif + + deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag: error when deallocating "//errorMessage + stop 1 + endif + + if (useGPU) then + ! todo: should we leave a_mat on the device for further use? + successCUDA = cuda_free(a_dev) + check_dealloc_cuda("tridiag: a_dev 9", successCUDA) + + successCUDA = cuda_free(v_row_dev) + check_dealloc_cuda("tridiag: v_row_dev", successCUDA) + + successCUDA = cuda_free(u_row_dev) + check_dealloc_cuda("tridiag: (u_row_dev", successCUDA) + + successCUDA = cuda_free(v_col_dev) + check_dealloc_cuda("tridiag: v_col_dev", successCUDA) + + successCUDA = cuda_free(u_col_dev) + check_dealloc_cuda("tridiag: u_col_dev ", successCUDA) + + successCUDA = cuda_free(vu_stored_rows_dev) + check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successCUDA) + + successCUDA = cuda_free(uv_stored_cols_dev) + check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successCUDA) + endif + + ! distribute the arrays d_vec and e_vec to all processors + + allocate(tmp_real(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag: error when allocating tmp_real "//errorMessage + stop 1 + endif + + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + tmp_real = d_vec + call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + tmp_real = d_vec + call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + tmp_real = e_vec + call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + tmp_real = e_vec + call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + deallocate(tmp_real, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag: error when deallocating tmp_real "//errorMessage + stop 1 + endif + + call obj%timer%stop("tridiag_& + &MATH_DATATYPE& + &" // & + PRECISION_SUFFIX // & + gpuString ) + + end subroutine tridiag_& + &MATH_DATATYPE& + &_& + &PRECISION diff -Nru elpa-2016.05.001/src/elpa1/elpa_cholesky_template.F90 elpa-2019.11.001/src/elpa1/elpa_cholesky_template.F90 --- elpa-2016.05.001/src/elpa1/elpa_cholesky_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa_cholesky_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,366 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! 
Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. 
+ +#include "../general/sanity.F90" + use elpa1_compute + use elpa_utilities + use elpa_mpi + use precision + use elpa_abstract_impl + use elpa_omp + use elpa_blas_interfaces + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*) +#else + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols) +#endif + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx + integer(kind=ik) :: n, nc, i, info + integer(kind=BLAS_KIND) :: infoBLAS + integer(kind=ik) :: lcs, lce, lrs, lre + integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + + MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) + logical :: wantDebug + logical :: success + integer(kind=ik) :: istat, debug, error + character(200) :: errorMessage + integer(kind=ik) :: nrThreads + + call obj%timer%start("elpa_cholesky_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + omp_threads_caller = omp_get_max_threads() + + ! check the number of threads that ELPA should use internally + call obj%get("omp_threads",nrThreads,error) + call omp_set_num_threads(nrThreads) +#else + nrThreads=1 +#endif + + na = obj%na + lda = obj%local_nrows + nblk = obj%nblk + matrixCols = obj%local_ncols + + call obj%get("mpi_comm_rows",mpi_comm_rows,error ) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + call obj%get("mpi_comm_cols",mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." 
+ stop + endif + + call obj%get("debug",debug,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + if (debug == 1) then + wantDebug = .true. + else + wantDebug = .false. + endif + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr) + + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + call obj%timer%stop("mpi_communication") + success = .true. + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide + + l_rows_tile = tile_size/np_rows ! local rows of a tile + l_cols_tile = tile_size/np_cols ! local cols of a tile + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a + + allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_cholesky_& + &MATH_DATATYPE&: error when allocating tmp1 "//errorMessage + stop 1 + endif + + allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_cholesky_& + &MATH_DATATYPE& + &: error when allocating tmp2 "//errorMessage + stop 1 + endif + + tmp1 = 0 + tmp2 = 0 + + allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"elpa_cholesky_& + &MATH_DATATYPE& + &: error when allocating tmatr "//errorMessage + stop 1 + endif + + allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_cholesky_& + &MATH_DATATYPE& + &: error when allocating tmatc "//errorMessage + stop 1 + endif + + tmatr = 0 + tmatc = 0 + + do n = 1, na, nblk + ! Calculate first local row and column of the still remaining matrix + ! on the local processor + + l_row1 = local_index(n, my_prow, np_rows, nblk, +1) + l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) + + l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) + l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) + + if (n+nblk > na) then + + ! This is the last step, just do a Cholesky-Factorization + ! of the remaining block + + if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then + call obj%timer%start("blas") + + call PRECISION_POTRF('U', int(na-n+1,kind=BLAS_KIND), a(l_row1,l_col1), & + int(lda,kind=BLAS_KIND), infoBLAS ) + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + + if (info/=0) then + if (wantDebug) write(error_unit,*) "elpa_cholesky_& + &MATH_DATATYPE& + +#if REALCASE == 1 + &: Error in dpotrf: ",info +#endif +#if COMPLEXCASE == 1 + &: Error in zpotrf: ",info +#endif + success = .false. + return + endif + + endif + + exit ! Loop + + endif + + if (my_prow==prow(n, nblk, np_rows)) then + + if (my_pcol==pcol(n, nblk, np_cols)) then + + ! The process owning the upper left remaining block does the + ! 
Cholesky-Factorization of this block + call obj%timer%start("blas") + + call PRECISION_POTRF('U', int(nblk,kind=BLAS_KIND), a(l_row1,l_col1), & + int(lda,kind=BLAS_KIND) , infoBLAS ) + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + + if (info/=0) then + if (wantDebug) write(error_unit,*) "elpa_cholesky_& + &MATH_DATATYPE& + +#if REALCASE == 1 + &: Error in dpotrf 2: ",info +#endif +#if COMPLEXCASE == 1 + &: Error in zpotrf 2: ",info + +#endif + success = .false. + return + endif + + nc = 0 + do i=1,nblk + tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) + nc = nc+i + enddo + endif +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call MPI_Bcast(tmp1, int(nblk*(nblk+1)/2,kind=MPI_KIND), & +#if REALCASE == 1 + MPI_REAL_PRECISION, & +#endif +#if COMPLEXCASE == 1 + MPI_COMPLEX_PRECISION, & +#endif + int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + nc = 0 + do i=1,nblk + tmp2(1:i,i) = tmp1(nc+1:nc+i) + nc = nc+i + enddo + + call obj%timer%start("blas") + if (l_cols-l_colx+1>0) & + call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', int(nblk,kind=BLAS_KIND), & + int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, tmp2, & + int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND) ) + call obj%timer%stop("blas") + endif + + do i=1,nblk + +#if REALCASE == 1 + if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) +#endif +#if COMPLEXCASE == 1 + if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) +#endif + +#ifdef WITH_MPI + + call obj%timer%start("mpi_communication") + if (l_cols-l_colx+1>0) & + call MPI_Bcast(tmatc(l_colx,i), int(l_cols-l_colx+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(prow(n, nblk, np_rows),kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI 
*/ + enddo + ! this has to be checked since it was changed substantially when doing type safe + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & + tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & + n, na, nblk, nblk, nrThreads) + + do i=0,(na-1)/tile_size + lcs = max(l_colx,i*l_cols_tile+1) + lce = min(l_cols,(i+1)*l_cols_tile) + lrs = l_rowx + lre = min(l_rows,(i+1)*l_rows_tile) + if (lce +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". 
+ +#include "../general/sanity.F90" + + use precision + use elpa1_compute + use elpa_utilities + use elpa_mpi + use elpa_abstract_impl + use elpa_blas_interfaces + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*) +#else + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols) +#endif + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx + integer(kind=ik) :: n, nc, i, info, ns, nb + integer(kind=BLAS_KIND) :: infoBLAS + MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:) + logical :: wantDebug + logical :: success + integer(kind=ik) :: istat, debug, error + character(200) :: errorMessage + + call obj%timer%start("elpa_invert_trm_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + + na = obj%na + lda = obj%local_nrows + nblk = obj%nblk + matrixCols = obj%local_ncols + + call obj%get("mpi_comm_rows",mpi_comm_rows,error) + if (error .ne. ELPA_OK) then + print *,"Error getting option. Aborting..." + stop + endif + call obj%get("mpi_comm_cols",mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Error getting option. Aborting..." + stop + endif + + call obj%get("debug", debug,error) + if (error .ne. ELPA_OK) then + print *,"Error getting option. Aborting..." + stop + endif + if (debug == 1) then + wantDebug = .true. + else + wantDebug = .true. 
+ endif + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + call obj%timer%stop("mpi_communication") + success = .true. + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a + + allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_invert_trm_& + &MATH_DATATYPE& + &: error when allocating tmp1 "//errorMessage + stop 1 + endif + + allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_invert_trm_& + &MATH_DATATYPE& + &: error when allocating tmp2 "//errorMessage + stop 1 + endif + + tmp1 = 0 + tmp2 = 0 + + allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_invert_trm_& + &MATH_DATATYPE& + &: error when allocating tmat1 "//errorMessage + stop 1 + endif + + allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"elpa_invert_trm_& + &MATH_DATATYPE& + &: error when allocating tmat2 "//errorMessage + stop 1 + endif + + tmat1 = 0 + tmat2 = 0 + + + ns = ((na-1)/nblk)*nblk + 1 + + do n = ns,1,-nblk + + l_row1 = local_index(n, my_prow, np_rows, nblk, +1) + l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) + + nb = nblk + if (na-n+1 < nblk) nb = na-n+1 + + l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1) + l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1) + + if (my_prow==prow(n, nblk, np_rows)) then + + if (my_pcol==pcol(n, nblk, np_cols)) then + call obj%timer%start("blas") + + call PRECISION_TRTRI('U', 'N', int(nb,kind=BLAS_KIND), a(l_row1,l_col1), int(lda,kind=BLAS_KIND), & + infoBLAS) + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + + if (info/=0) then + if (wantDebug) write(error_unit,*) "elpa_invert_trm_& + &MATH_DATATYPE& + +#if REALCASE == 1 + &: Error in DTRTRI" +#endif +#if COMPLEXCASE == 1 + &: Error in ZTRTRI" +#endif + + success = .false. + call obj%timer%stop("elpa_invert_trm_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + return + endif + + nc = 0 + do i=1,nb + tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) + nc = nc+i + enddo + endif +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Bcast(tmp1, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + nc = 0 + do i=1,nb + tmp2(1:i,i) = tmp1(nc+1:nc+i) + nc = nc+i + enddo + + call obj%timer%start("blas") + if (l_cols-l_colx+1>0) & + call PRECISION_TRMM('L', 'U', 'N', 'N', int(nb,kind=BLAS_KIND), int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, & + tmp2, int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND)) + call obj%timer%stop("blas") + if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) + if (my_pcol==pcol(n, nblk, np_cols)) 
tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 + + endif + + if (l_row1>1) then + if (my_pcol==pcol(n, nblk, np_cols)) then + tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) + a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 + endif + + do i=1,nb +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Bcast(tmat1(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + enddo + endif +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + if (l_cols-l_col1+1>0) & + call MPI_Bcast(tmat2(1,l_col1), int((l_cols-l_col1+1)*nblk,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(prow(n, nblk, np_rows),kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + call obj%timer%start("blas") + if (l_row1>1 .and. l_cols-l_col1+1>0) & + call PRECISION_GEMM('N', 'N', int(l_row1-1,kind=BLAS_KIND), int(l_cols-l_col1+1,kind=BLAS_KIND), & + int(nb,kind=BLAS_KIND), -ONE, & + tmat1, int(ubound(tmat1,dim=1),kind=BLAS_KIND), tmat2(1,l_col1), & + int(ubound(tmat2,dim=1),kind=BLAS_KIND), ONE, & + a(1,l_col1), int(lda,kind=BLAS_KIND) ) + + call obj%timer%stop("blas") + + enddo + + deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_invert_trm_& + &MATH_DATATYPE& + &: error when deallocating tmp1 "//errorMessage + stop 1 + endif + + call obj%timer%stop("elpa_invert_trm_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") diff -Nru elpa-2016.05.001/src/elpa1/elpa_multiply_a_b.F90 elpa-2019.11.001/src/elpa1/elpa_multiply_a_b.F90 --- elpa-2016.05.001/src/elpa1/elpa_multiply_a_b.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa_multiply_a_b.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,342 @@ +! +! 
The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! 
Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: A. Marek, MPCDF + + +#include "../general/sanity.F90" + + use elpa1_compute + use elpa_mpi + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + implicit none + +#include "../../src/general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + + character*1 :: uplo_a, uplo_c + + integer(kind=ik), intent(in) :: ldb, ldbCols, ldc, ldcCols + integer(kind=ik) :: na, ncb +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*) +#else + MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbCols), c(ldc,ldcCols) +#endif + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: l_cols, l_rows, l_rows_np + integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce + integer(kind=ik) :: gcol_min, gcol, goff + integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals + integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) + integer(kind=MPI_KIND) :: mpierr + + logical :: a_lower, a_upper, c_lower, c_upper + MATH_DATATYPE(kind=rck), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) + integer(kind=ik) :: istat + character(200) :: errorMessage + logical :: success + integer(kind=ik) :: nblk, mpi_comm_rows, mpi_comm_cols, lda, ldaCols, error + + call obj%timer%start("elpa_mult_at_b_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + + na = obj%na + nblk = obj%nblk + lda = obj%local_nrows + ldaCols = obj%local_ncols + + + call obj%get("mpi_comm_rows",mpi_comm_rows,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." 
+ stop + endif + call obj%get("mpi_comm_cols",mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + + success = .true. + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + call obj%timer%stop("mpi_communication") + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b + l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b + + ! Block factor for matrix multiplications, must be a multiple of nblk + + if (na/np_rows<=256) then + nblk_mult = (31/nblk+1)*nblk + else + nblk_mult = (63/nblk+1)*nblk + endif + + allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when allocating aux_mat "//errorMessage + stop 1 + endif + + allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when allocating aux_bc "//errorMessage + stop 1 + endif + + allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when allocating lrs_save "//errorMessage + stop 1 + endif + + allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when allocating lre_save "//errorMessage + stop 1 + endif + + a_lower = .false. + a_upper = .false. + c_lower = .false. + c_upper = .false. + + if (uplo_a=='u' .or. 
uplo_a=='U') a_upper = .true. + if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. + if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. + if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. + + ! Build up the result matrix by processor rows + + do np = 0, np_rows-1 + + ! In this turn, procs of row np assemble the result + + l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors + + nr_done = 0 ! Number of rows done + aux_mat = 0 + nstor = 0 ! Number of columns stored in aux_mat + + ! Loop over the blocks on row np + + do nb=0,(l_rows_np-1)/nblk + + goff = nb*np_rows + np ! Global offset in blocks corresponding to nb + + ! Get the processor column which owns this block (A is transposed, so we need the column) + ! and the offset in blocks within this column. + ! The corresponding block column in A is then broadcast to all for multiplication with B + + np_bc = MOD(goff,np_cols) + noff = goff/np_cols + n_aux_bc = 0 + + ! Gather up the complete block column of A on the owner + + do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast + + gcol = goff*nblk + n ! global column corresponding to n + if (nstor==0 .and. n==1) gcol_min = gcol + + lrs = 1 ! 1st local row number for broadcast + lre = l_rows ! last local row number for broadcast + if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + if (lrs<=lre) then + nvals = lre-lrs+1 + if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) + n_aux_bc = n_aux_bc + nvals + endif + + lrs_save(n) = lrs + lre_save(n) = lre + + enddo + + ! 
Broadcast block column +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") +#if REALCASE == 1 + call MPI_Bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND), & + MPI_REAL_PRECISION, & + int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) +#endif +#if COMPLEXCASE == 1 + call MPI_Bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND), & + MPI_COMPLEX_PRECISION, & + int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) +#endif + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + ! Insert what we got in aux_mat + + n_aux_bc = 0 + do n = 1, min(l_rows_np-nb*nblk,nblk) + nstor = nstor+1 + lrs = lrs_save(n) + lre = lre_save(n) + if (lrs<=lre) then + nvals = lre-lrs+1 + aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) + n_aux_bc = n_aux_bc + nvals + endif + enddo + + ! If we got nblk_mult columns in aux_mat or this is the last block + ! do the matrix multiplication + + if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then + + lrs = 1 ! 1st local row number for multiply + lre = l_rows ! last local row number for multiply + if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + lcs = 1 ! 1st local col number for multiply + lce = l_cols ! last local col number for multiply + if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) + if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) + + if (lcs<=lce) then + allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when allocating tmp1 "//errorMessage + stop 1 + endif + + if (lrs<=lre) then + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), & + int(lce-lcs+1,kind=BLAS_KIND), int(lre-lrs+1,kind=BLAS_KIND), & + ONE, aux_mat(lrs,1), int(ubound(aux_mat,dim=1),kind=BLAS_KIND), & + b(lrs,lcs), int(ldb,kind=BLAS_KIND), ZERO, tmp1, & + int(nstor,kind=BLAS_KIND)) + call obj%timer%stop("blas") + else + tmp1 = 0 + endif + + ! Sum up the results and send to processor row np +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + ! Put the result into C + if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) + +#else /* WITH_MPI */ +! tmp2 = tmp1 + ! Put the result into C + if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce) + +#endif /* WITH_MPI */ + + deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when deallocating tmp1 "//errorMessage + stop 1 + endif + + endif + + nr_done = nr_done+nstor + nstor=0 + aux_mat(:,:)=0 + endif + enddo + enddo + + deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"elpa_mult_at_b_& + &MATH_DATATYPE& + &: error when deallocating aux_mat "//errorMessage + stop 1 + endif + + call obj%timer%stop("elpa_mult_at_b_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + diff -Nru elpa-2016.05.001/src/elpa1/elpa_reduce_add_vectors.F90 elpa-2019.11.001/src/elpa1/elpa_reduce_add_vectors.F90 --- elpa-2016.05.001/src/elpa1/elpa_reduce_add_vectors.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa_reduce_add_vectors.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,219 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! 
ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: Andreas Marek, MPCDF +#endif + +#include "config-f90.h" +#include "../general/sanity.F90" + +subroutine elpa_reduce_add_vectors_& +&MATH_DATATYPE& +&_& +&PRECISION & + (obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvr, nvc, nblk, nrThreads) + +!------------------------------------------------------------------------------- +! This routine does a reduce of all vectors in vmat_s over the communicator comm_t. +! The result of the reduce is gathered on the processors owning the diagonal +! and added to the array of vectors vmat_t (which is distributed over comm_t). +! +! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s +! in the different members within vmat_t (else a reduce wouldn't be necessary). +! After this routine, an allreduce of vmat_t has to be done. +! +! vmat_s array of vectors to be reduced and added +! ld_s leading dimension of vmat_s +! comm_s communicator over which vmat_s is distributed +! vmat_t array of vectors to which vmat_s is added +! ld_t leading dimension of vmat_t +! comm_t communicator over which vmat_t is distributed +! nvr global length of vmat_s/vmat_t +! nvc number of columns in vmat_s/vmat_t +! nblk block size of block cyclic distribution +! 
+!------------------------------------------------------------------------------- + + use precision +#ifdef WITH_OPENMP + use omp_lib +#endif + use elpa_mpi + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: vmat_s(ld_s,nvc) + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: vmat_t(ld_t,nvc) + + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: aux1(:), aux2(:) + integer(kind=ik) :: myps, mypt, nps, npt + integer(kind=MPI_KIND) :: mypsMPI, npsMPI, myptMPI, nptMPI + integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: lcm_s_t, nblks_tot + integer(kind=ik) :: auxstride + integer(kind=ik), intent(in) :: nrThreads + + + call obj%timer%start("elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX & + ) + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(comm_s,kind=MPI_KIND), mypsMPI, mpierr) + call mpi_comm_size(int(comm_s,kind=MPI_KIND), npsMPI, mpierr) + call mpi_comm_rank(int(comm_t,kind=MPI_KIND), myptMPI, mpierr) + call mpi_comm_size(int(comm_t,kind=MPI_KIND), nptMPI ,mpierr) + myps = int(mypsMPI,kind=c_int) + nps = int(npsMPI,kind=c_int) + mypt = int(myptMPI,kind=c_int) + npt = int(nptMPI,kind=c_int) + + call obj%timer%stop("mpi_communication") + + ! Look to elpa_transpose_vectors for the basic idea! + + ! The communictation pattern repeats in the global matrix after + ! the least common multiple of (nps,npt) blocks + + lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt + + nblks_tot = (nvr+nblk-1)/nblk ! 
number of blocks corresponding to nvr + + allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) + allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) + aux1(:) = 0 + aux2(:) = 0 +#ifdef WITH_OPENMP + !call omp_set_num_threads(nrThreads) + + !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) +#endif + do n = 0, lcm_s_t-1 + + ips = mod(n,nps) + ipt = mod(n,npt) + + auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t) + + if(myps == ips) then + +! k = 0 +#ifdef WITH_OPENMP + !$omp do +#endif + do lc=1,nvc + do i = n, nblks_tot-1, lcm_s_t + k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride + ns = (i/nps)*nblk ! local start of block i + nl = min(nvr-i*nblk,nblk) ! length + aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) +! k = k+nblk + enddo + enddo + + k = nvc * auxstride +#ifdef WITH_OPENMP + !$omp barrier + !$omp master +#endif + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + if (k>0) call mpi_reduce(aux1, aux2, k, & +#if REALCASE == 1 + MPI_REAL_PRECISION, & +#endif +#if COMPLEXCASE == 1 + MPI_COMPLEX_PRECISION, & +#endif + MPI_SUM, int(ipt,kind=MPI_KIND), int(comm_t,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + if(k>0) aux2 = aux1 +#endif /* WITH_MPI */ + +#ifdef WITH_OPENMP + !$omp end master + !$omp barrier +#endif + if (mypt == ipt) then +! k = 0 +#ifdef WITH_OPENMP + !$omp do +#endif + do lc=1,nvc + do i = n, nblks_tot-1, lcm_s_t + k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride + ns = (i/npt)*nblk ! local start of block i + nl = min(nvr-i*nblk,nblk) ! length + vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl) +! 
k = k+nblk + enddo + enddo + endif + + endif + + enddo +#ifdef WITH_OPENMP + !$omp end parallel +#endif + + deallocate(aux1) + deallocate(aux2) + + call obj%timer%stop("elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX & + ) +end subroutine + + diff -Nru elpa-2016.05.001/src/elpa1/elpa_solve_tridi_impl_public.F90 elpa-2019.11.001/src/elpa1/elpa_solve_tridi_impl_public.F90 --- elpa-2016.05.001/src/elpa1/elpa_solve_tridi_impl_public.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa_solve_tridi_impl_public.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,156 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! 
You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: A. Marek, MPCDF + + + +#include "../general/sanity.F90" + + use elpa1_compute, solve_tridi_& + &PRECISION& + &_private_impl => solve_tridi_& + &PRECISION& + &_impl + use precision + use elpa_abstract_impl + use elpa_omp + + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + real(kind=REAL_DATATYPE) :: d(obj%na), e(obj%na) +#ifdef USE_ASSUMED_SIZE + real(kind=REAL_DATATYPE) :: q(obj%local_nrows,*) +#else + real(kind=REAL_DATATYPE) :: q(obj%local_nrows, obj%local_ncols) +#endif + + logical :: wantDebug + logical :: success + + integer :: debug, error + integer :: nrThreads + + call obj%timer%start("elpa_solve_tridi_public_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + na = obj%na + nev = obj%nev + nblk = obj%nblk + ldq = obj%local_nrows + matrixCols = obj%local_ncols + +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + omp_threads_caller = omp_get_max_threads() + + ! 
check the number of threads that ELPA should use internally + + call obj%get("omp_threads",nrThreads,error) +#else + nrThreads=1 +#endif + + call obj%get("mpi_comm_rows", mpi_comm_rows,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + call obj%get("mpi_comm_cols", mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + call obj%get("debug",debug,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + if (debug == 1) then + wantDebug = .true. + else + wantDebug = .false. + endif + success = .false. + + call solve_tridi_& + &PRECISION& + &_private_impl(obj, na, nev, d, e, q, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols,.false., wantDebug, success, & + nrThreads) + + + ! restore original OpenMP settings +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + call omp_set_num_threads(omp_threads_caller) +#endif + + + call obj%timer%stop("elpa_solve_tridi_public_& + &MATH_DATATYPE& + &_& + &PRECISION& + &") + + +#undef REALCASE +#undef COMPLEXCASE +#undef DOUBLE_PRECISION +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa1/elpa_transpose_vectors.F90 elpa-2019.11.001/src/elpa1/elpa_transpose_vectors.F90 --- elpa-2016.05.001/src/elpa1/elpa_transpose_vectors.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa1/elpa_transpose_vectors.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,232 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! 
- Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! 
Author: Andreas Marek, MPCDF +#endif + +#include "config-f90.h" +#include "../general/sanity.F90" + +#undef ROUTINE_NAME +#ifdef SKEW_SYMMETRIC_BUILD +#define ROUTINE_NAME elpa_transpose_vectors_ss_ +#else +#define ROUTINE_NAME elpa_transpose_vectors_ +#endif + + +subroutine ROUTINE_NAME& +&MATH_DATATYPE& +&_& +&PRECISION & + (obj, vmat_s, ld_s, comm_s, vmat_t, ld_t, comm_t, nvs, nvr, nvc, nblk, nrThreads) + +!------------------------------------------------------------------------------- +! This routine transposes an array of vectors which are distributed in +! communicator comm_s into its transposed form distributed in communicator comm_t. +! There must be an identical copy of vmat_s in every communicator comm_s. +! After this routine, there is an identical copy of vmat_t in every communicator comm_t. +! +! vmat_s original array of vectors +! ld_s leading dimension of vmat_s +! comm_s communicator over which vmat_s is distributed +! vmat_t array of vectors in transposed form +! ld_t leading dimension of vmat_t +! comm_t communicator over which vmat_t is distributed +! nvs global index where to start in vmat_s/vmat_t +! Please note: this is kind of a hint, some values before nvs will be +! accessed in vmat_s/put into vmat_t +! nvr global length of vmat_s/vmat_t +! nvc number of columns in vmat_s/vmat_t +! nblk block size of block cyclic distribution +! 
+!------------------------------------------------------------------------------- + use precision + use elpa_abstract_impl +#ifdef WITH_OPENMP + use omp_lib +#endif + use elpa_mpi + + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: vmat_s(ld_s,nvc) + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout):: vmat_t(ld_t,nvc) + + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: aux(:) + integer(kind=ik) :: myps, mypt, nps, npt + integer(kind=MPI_KIND) :: mypsMPI, myptMPI, npsMPI, nptMPI + integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip + integer(kind=ik) :: auxstride + integer(kind=ik), intent(in) :: nrThreads + + call obj%timer%start("ROUTINE_NAME& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX & + ) + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(comm_s,kind=MPI_KIND),mypsMPI, mpierr) + call mpi_comm_size(int(comm_s,kind=MPI_KIND),npsMPI ,mpierr) + call mpi_comm_rank(int(comm_t,kind=MPI_KIND),myptMPI, mpierr) + call mpi_comm_size(int(comm_t,kind=MPI_KIND),nptMPI ,mpierr) + myps = int(mypsMPI,kind=c_int) + nps = int(npsMPI,kind=c_int) + mypt = int(myptMPI,kind=c_int) + npt = int(nptMPI,kind=c_int) + + + call obj%timer%stop("mpi_communication") + ! The basic idea of this routine is that for every block (in the block cyclic + ! distribution), the processor within comm_t which owns the diagonal + ! broadcasts its values of vmat_s to all processors within comm_t. + ! Of course this has not to be done for every block separately, since + ! the communictation pattern repeats in the global matrix after + ! the least common multiple of (nps,npt) blocks + + lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt + + nblks_tot = (nvr+nblk-1)/nblk ! 
number of blocks corresponding to nvr + + ! Get the number of blocks to be skipped at the begin. + ! This must be a multiple of lcm_s_t (else it is getting complicated), + ! thus some elements before nvs will be accessed/set. + + nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t + + allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) +#ifdef WITH_OPENMP + !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) +#endif + do n = 0, lcm_s_t-1 + + ips = mod(n,nps) + ipt = mod(n,npt) + + if(mypt == ipt) then + + nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t + auxstride = nblk * nblks_comm +! if(nblks_comm==0) cycle + if (nblks_comm .ne. 0) then + if(myps == ips) then +! k = 0 +#ifdef WITH_OPENMP + !$omp do +#endif + do lc=1,nvc + do i = nblks_skip+n, nblks_tot-1, lcm_s_t + k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride + ns = (i/nps)*nblk ! local start of block i + nl = min(nvr-i*nblk,nblk) ! length + aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) +! k = k+nblk + enddo + enddo + endif + +#ifdef WITH_OPENMP + !$omp barrier + !$omp master +#endif + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call MPI_Bcast(aux, int(nblks_comm*nblk*nvc,kind=MPI_KIND), & +#if REALCASE == 1 + MPI_REAL_PRECISION, & +#endif +#if COMPLEXCASE == 1 + MPI_COMPLEX_PRECISION, & +#endif + int(ips,kind=MPI_KIND), int(comm_s,kind=MPI_KIND), mpierr) + + + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + +#ifdef WITH_OPENMP + !$omp end master + !$omp barrier + + !$omp do +#endif +! k = 0 + do lc=1,nvc + do i = nblks_skip+n, nblks_tot-1, lcm_s_t + k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride + ns = (i/npt)*nblk ! local start of block i + nl = min(nvr-i*nblk,nblk) ! length +#ifdef SKEW_SYMMETRIC_BUILD + vmat_t(ns+1:ns+nl,lc) = - aux(k+1:k+nl) +#else + vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl) +#endif +! 
k = k+nblk + enddo + enddo + endif + endif + + enddo +#ifdef WITH_OPENMP + !$omp end parallel +#endif + deallocate(aux) + + call obj%timer%stop("ROUTINE_NAME& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX & + ) + +end subroutine + diff -Nru elpa-2016.05.001/src/elpa1_compute.F90 elpa-2019.11.001/src/elpa1_compute.F90 --- elpa-2016.05.001/src/elpa1_compute.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa1_compute.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,4632 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - -#include "config-f90.h" - -module ELPA1_compute - use elpa_utilities -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use elpa_mpi - implicit none - - PRIVATE ! set default to private - - public :: tridiag_real ! Transform real symmetric matrix to tridiagonal form - public :: trans_ev_real ! Transform eigenvectors of a tridiagonal matrix back - public :: mult_at_b_real ! Multiply real matrices A**T * B - - public :: tridiag_complex ! Transform complex hermitian matrix to tridiagonal form - public :: trans_ev_complex ! Transform eigenvectors of a tridiagonal matrix back - public :: mult_ah_b_complex ! Multiply complex matrices A**H * B - - public :: solve_tridi ! Solve tridiagonal eigensystem with divide and conquer method - - public :: cholesky_real ! Cholesky factorization of a real matrix - public :: invert_trm_real ! Invert real triangular matrix - - public :: cholesky_complex ! Cholesky factorization of a complex matrix - public :: invert_trm_complex ! Invert complex triangular matrix - - public :: local_index ! Get local index of a block cyclic distributed matrix - public :: least_common_multiple ! 
Get least common multiple - - public :: hh_transform_real - public :: hh_transform_complex - - public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real - public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real - - contains - -#define DATATYPE REAL(kind=rk) -#define BYTESIZE 8 -#define REALCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef REALCASE - - subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form - ! (like Scalapack Routine PDSYTRD) - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na), tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*) -#else - real(kind=rk) :: a(lda,matrixCols) -#endif - - integer(kind=ik), parameter :: max_stored_rows = 32 - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - real(kind=rk), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! 
local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating tmp "//errorMessage - stop - endif - - allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vr "//errorMessage - stop - endif - - allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating ur "//errorMessage - stop - endif - - allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vc "//errorMessage - stop - endif - - allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating uc "//errorMessage - stop - endif - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating ur_p "//errorMessage - stop - endif - - allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating uc_p "//errorMessage - stop - endif - -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vur "//errorMessage - stop - endif - - allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_real: error when allocating uvc "//errorMessage - stop - endif - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if(my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if(nstor>0 .and. l_rows>0) then - call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & - uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) - endif - - if(my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. - endif - -#ifdef WITH_MPI - call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - aux2 = aux1 -#endif - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if(my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! 
Broadcast the Householder vector (and tau) along columns - - if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) -#ifdef WITH_MPI - call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - - call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, istep-1, 1, nblk) - - - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. -#endif - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then - call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) - call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - istep-1, 1, nblk) - endif - - ! 
Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) -#ifdef WITH_MPI - call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - uc = tmp -#endif - endif - - call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, istep-1, 1, nblk) - - ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - x = 0 - if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) -#ifdef WITH_MPI - call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) -#else - vav = x -#endif - ! store u and v in the matrices U and V - ! these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = tau(istep)*vr(j) - vur(j,2*nstor+2) = 0.5*tau(istep)*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*tau(istep)*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = tau(istep)*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo - - ! Store e(1) and d(1) - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when deallocating uvc "//errorMessage - stop - endif - - - ! distribute the arrays d and e to all processors - - allocate(tmp(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_real: error when allocating tmp "//errorMessage - stop - endif -#ifdef WITH_MPI - tmp = d - call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmp = d - call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - tmp = e - call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmp = e - call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) -#endif - deallocate(tmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when deallocating tmp "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_real") -#endif - - end subroutine tridiag_real - - subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PDORMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a and q - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), q(ldq,*) -#else - real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - - integer(kind=ik) :: max_stored_rows - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - real(kind=rk), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - real(kind=rk), allocatable :: tmat(:,:), h1(:), h2(:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating tmat "//errorMessage - stop - endif - - allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating h1 "//errorMessage - stop - endif - - allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"trans_ev_real: error when allocating h2 "//errorMessage - stop - endif - - allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating tmp2 "//errorMessage - stop - endif - - allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating hvn "//errorMessage - stop - endif - - allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating hvm "//errorMessage - stop - endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - nstor = 0 - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice0) & - call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) -#endif - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use dsyrk - - tmat = 0 - if (l_rows>0) & - call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) - - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo -#ifdef WITH_MPI - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - if (nc>0) h2 = h1 -#endif - ! 
Calculate triangular matrix T - - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n=1,nstor-1 - call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1) - tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo - - ! Q = Q - V * T * V**T * Q - - if (l_rows>0) then - call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & - q,ldq,0.d0,tmp1,nstor) - else - tmp1(1:l_cols*nstor) = 0 - endif -#ifdef WITH_MPI - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - tmp2 = tmp1 -#endif - if (l_rows>0) then - call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) - call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & - tmp2,nstor,1.d0,q,ldq) - endif - nstor = 0 - endif - - enddo - - deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when deallocating hvm "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_real") -#endif - - end subroutine trans_ev_real - - subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc) - - !------------------------------------------------------------------------------- - ! mult_at_b_real: Performs C := A**T * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! 
uplo_c 'U' if only the upper diagonal part of C is needed - ! 'L' if only the upper diagonal part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! lda Leading dimension of a - ! - ! b Matrix B - ! - ! ldb Leading dimension of b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - - integer(kind=ik) :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc - real(kind=rk) :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size! - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - real(kind=rk), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_at_b_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! 
Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating aux_mat "//errorMessage - stop - endif - - allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating aux_bc "//errorMessage - stop - endif - - allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating lrs_save "//errorMessage - stop - endif - - allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating lre_save "//errorMessage - stop - endif - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. - if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! 
The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column -#ifdef WITH_MPI - call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr) -#endif - ! Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! 
last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating tmp1 "//errorMessage - stop - endif - - if (lrs<=lre) then - call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,0.d0,tmp1,nstor) - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np -#ifdef WITH_MPI - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr) -#else - tmp2 = tmp1 -#endif - ! Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when deallocating tmp1 "//errorMessage - stop - endif - - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when deallocating aux_mat "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_at_b_real") -#endif - - end subroutine mult_at_b_real - -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 16 -#define COMPLEXCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef COMPLEXCASE - - subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form - ! (like Scalapack Routine PZHETRD) - ! - ! Parameters - ! - ! 
na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - real(kind=rk) :: d(na), e(na) - - integer(kind=ik), parameter :: max_stored_rows = 32 - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - complex(kind=ck), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), 
vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - real(kind=rk), allocatable :: tmpr(:) - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating tmp "//errorMessage - stop - endif - - allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vr "//errorMessage - stop - endif - - allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating ur "//errorMessage - stop - endif - - allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vc "//errorMessage - stop - endif - - allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_complex: error when allocating uc "//errorMessage - stop - endif - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating ur_p "//errorMessage - stop - endif - - allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating uc_p "//errorMessage - stop - endif -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vur "//errorMessage - stop - endif - - allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating uvc "//errorMessage - stop - endif - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if (my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if (nstor>0 .and. 
l_rows>0) then - aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) - call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), & - aux,1,CONE,vr,1) - endif - - if (my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. - endif -#ifdef WITH_MPI - call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - aux2 = aux1 -#endif - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if (my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! Broadcast the Householder vector (and tau) along columns - - if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) -#ifdef WITH_MPI - call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - -! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & -! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & -! 1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, (istep-1), 1, nblk) - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. 
l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. -#endif - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then - call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1) - call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1) - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - (istep-1), 1, nblk) - endif - - ! Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) -#ifdef WITH_MPI - call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - uc = tmp -#endif - endif - -! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & -! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & -! 1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, (istep-1), 1, nblk) - - - - ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - xc = 0 - if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) -#ifdef WITH_MPI - call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr) -#else - vav = xc -#endif - ! store u and v in the matrices U and V - ! 
these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = conjg(tau(istep))*vr(j) - vur(j,2*nstor+2) = 0.5*conjg(tau(istep))*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*conjg(tau(istep))*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = conjg(tau(istep))*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo ! istep - - ! Store e(1) and d(1) - - if (my_pcol==pcol(2, nblk, np_cols)) then - if (my_prow==prow(1, nblk, np_rows)) then - ! We use last l_cols value of loop above - vrl = a(1,l_cols) - call hh_transform_complex(vrl, 0.d0, xf, tau(2)) - e(1) = vrl - a(1,l_cols) = 1. ! for consistency only - endif -#ifdef WITH_MPI - call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr) -#endif - endif -#ifdef WITH_MPI - call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,pcol(2, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when deallocating tmp "//errorMessage - stop - endif - ! distribute the arrays d and e to all processors - - allocate(tmpr(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_complex: error when allocating tmpr "//errorMessage - stop - endif -#ifdef WITH_MPI - tmpr = d - call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmpr = d - call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) - tmpr = e - call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - tmpr = e - call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) -#endif - deallocate(tmpr, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when deallocating tmpr "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_complex") -#endif - - end subroutine tridiag_complex - - subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PZUNMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), q(ldq,*) -#else - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - integer(kind=ik) :: max_stored_rows - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - complex(kind=ck), allocatable :: tmat(:,:), h1(:), h2(:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmat "//errorMessage - stop - endif - - allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"trans_ev_complex: error when allocating h1 "//errorMessage - stop - endif - - allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating h2 "//errorMessage - stop - endif - - allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmp2 "//errorMessage - stop - endif - - allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating hvb "//errorMessage - stop - endif - - allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating hvm "//errorMessage - stop - endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - nstor = 0 - - ! In the complex case tau(2) /= 0 - if (my_prow == prow(1, nblk, np_rows)) then - q(1,1:l_cols) = q(1,1:l_cols)*((1.d0,0.d0)-tau(2)) - endif - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice0) & - call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) -#endif - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! 
This can be done in different ways, we use zherk - - tmat = 0 - if (l_rows>0) & - call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows) - - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo -#ifdef WITH_MPI - if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - if (nc>0) h2=h1 -#endif - ! Calculate triangular matrix T - - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n=1,nstor-1 - call ztrmv('L','C','N',n,tmat,max_stored_rows,h2(nc+1),1) - tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo - - ! Q = Q - V * T * V**T * Q - - if (l_rows>0) then - call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & - q,ldq,CZERO,tmp1,nstor) - else - tmp1(1:l_cols*nstor) = 0 - endif -#ifdef WITH_MPI - call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - tmp2 = tmp1 -#endif - if (l_rows>0) then - call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor) - call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), & - tmp2,nstor,CONE,q,ldq) - endif - nstor = 0 - endif - - enddo - - deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when deallocating hvb "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_complex") -#endif - - end subroutine trans_ev_complex - - subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc) - - !------------------------------------------------------------------------------- - ! mult_ah_b_complex: Performs C := A**H * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! 
triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! uplo_c 'U' if only the upper diagonal part of C is needed - ! 'L' if only the upper diagonal part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! lda Leading dimension of a - ! - ! b Matrix B - ! - ! ldb Leading dimension of b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - - integer(kind=ik) :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc - complex(kind=ck) :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size! 
- - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - complex(kind=ck), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_ah_b_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating aux_mat "//errorMessage - stop - endif - - allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating aux_bc "//errorMessage - stop - endif - - allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating lrs_save "//errorMessage - stop - endif - - allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating lre_save "//errorMessage - stop - endif - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. 
uplo_a=='U') a_upper = .true. - if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column -#ifdef WITH_MPI - call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr) -#endif - ! 
Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating tmp1 "//errorMessage - stop - endif - - if (lrs<=lre) then - call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), & - b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor) - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np -#ifdef WITH_MPI - call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr) -#else - tmp2 = tmp1 -#endif - ! Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage - stop - endif - - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when deallocating aux_mat "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_ah_b_complex") -#endif - - end subroutine mult_ah_b_complex - - subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - - integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - - integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_tridi") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - ! Set Q to 0 - - q(1:l_rows, 1:l_cols) = 0. - - ! Get the limits of the subdivisons, each subdivison has as many cols - ! 
as fit on the respective processor column - - allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating limits "//errorMessage - stop - endif - - limits(0) = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np - - ! Check for the case that a column has have zero width. - ! This is not supported! - ! Scalapack supports it but delivers no results for these columns, - ! which is rather annoying - if (nc==0) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' - success = .false. - return - endif - limits(np+1) = limits(np) + nc - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,np_cols-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - ! Solve sub problems on processsor columns - - nc = limits(my_pcol) ! column after which my problem starts - - if (np_cols>1) then - nev1 = l_cols ! all eigenvectors are needed - else - nev1 = MIN(nev,l_cols) - endif - call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & - matrixCols, mpi_comm_rows, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - ! If there is only 1 processor column, we are done - - if (np_cols==1) then - deallocate(limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when deallocating limits "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - ! Set index arrays for Q columns - - ! Dense distribution scheme: - - allocate(l_col(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"solve_tridi: error when allocating l_col "//errorMessage - stop - endif - - allocate(p_col(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating p_col "//errorMessage - stop - endif - - n = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) - do i=1,nc - n = n+1 - l_col(n) = i - p_col(n) = np - enddo - enddo - - ! Block cyclic distribution scheme, only nev columns are set: - - allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating l_col_bc "//errorMessage - stop - endif - - allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating p_col_bc "//errorMessage - stop - endif - - p_col_bc(:) = -1 - l_col_bc(:) = -1 - - do i = 0, na-1, nblk*np_cols - do j = 0, np_cols-1 - do n = 1, nblk - if (i+j*nblk+n <= MIN(nev,na)) then - p_col_bc(i+j*nblk+n) = j - l_col_bc(i+j*nblk+n) = i/np_cols + n - endif - enddo - enddo - enddo - - ! Recursively merge sub problems - - call merge_recursive(0, np_cols, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when deallocating l_col "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - - contains - recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) - use precision - implicit none - - ! noff is always a multiple of nblk_ev - ! nlen-noff is always > nblk_ev - - integer(kind=ik) :: np_off, nprocs - integer(kind=ik) :: np1, np2, noff, nlen, nmid, n -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(mpi_status_size) -#endif - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. 
- - if (nprocs<=1) then - ! Safety check only - if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs - success = .false. - return - endif - ! Split problem into 2 subproblems of size np1 / np2 - - np1 = nprocs/2 - np2 = nprocs-np1 - - if (np1 > 1) call merge_recursive(np_off, np1, wantDebug, success) - if (.not.(success)) return - if (np2 > 1) call merge_recursive(np_off+np1, np2, wantDebug, success) - if (.not.(success)) return - - noff = limits(np_off) - nmid = limits(np_off+np1) - noff - nlen = limits(np_off+nprocs) - noff - -#ifdef WITH_MPI - if (my_pcol==np_off) then - do n=np_off+np1,np_off+nprocs-1 - call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr) - enddo - endif -#endif - - if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) - n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries - ndiv = ndiv*2 - enddo - - ! If there is only 1 processor row and not all eigenvectors are needed - ! and the matrix size is big enough, then use 2 subdivisions - ! so that merge_systems is called once and only the needed - ! eigenvectors are calculated for the final problem. - - if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 - - allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating limits "//errorMessage - stop - endif - - limits(0) = 0 - limits(ndiv) = na - - n = ndiv - do while(n>1) - n = n/2 ! n is always a power of 2 - do i=0,ndiv-1,2*n - ! We want to have even boundaries (for cache line alignments) - limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 - enddo - enddo - - ! Calculate the maximum size of a subproblem - - max_size = 0 - do i=1,ndiv - max_size = MAX(max_size,limits(i)-limits(i-1)) - enddo - - ! 
Subdivide matrix by subtracting rank 1 modifications - - do i=1,ndiv-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - if (np_rows==1) then - - ! For 1 processor row there may be 1 or 2 subdivisions - - do n=0,ndiv-1 - noff = limits(n) ! Start of subproblem - nlen = limits(n+1)-noff ! Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1), & - q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) - if (.not.(success)) return - enddo - - else - - ! Solve sub problems in parallel with solve_tridi_single - ! There is at maximum 1 subproblem per processor - - allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage - stop - endif - - allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage - stop - endif - - qmat1 = 0 ! Make sure that all elements are defined - - if (my_prow < ndiv) then - - noff = limits(my_prow) ! Start of subproblem - nlen = limits(my_prow+1)-noff ! Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, & - ubound(qmat1,dim=1), wantDebug, success) - - if (.not.(success)) return - endif - - ! Fill eigenvectors in qmat1 into global matrix q - - do np = 0, ndiv-1 - - noff = limits(np) - nlen = limits(np+1)-noff -#ifdef WITH_MPI - call MPI_Bcast(d(noff+1),nlen,MPI_REAL8,np,mpi_comm_rows,mpierr) -#endif - qmat2 = qmat1 -#ifdef WITH_MPI - call MPI_Bcast(qmat2,max_size*max_size,MPI_REAL8,np,mpi_comm_rows,mpierr) -#endif - do i=1,nlen - call distribute_global_column(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) - enddo - - enddo - - deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage - stop - endif - - endif - - ! 
Allocate and set index arrays l_col and p_col - - allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating l_col "//errorMessage - stop - endif - - do i=1,na - l_col(i) = i - p_col_i(i) = 0 - p_col_o(i) = 0 - enddo - - ! Merge subproblems - - n = 1 - do while(n 1d-14) then - write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) - else - write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) - write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' - write(error_unit,'(a)') 'In this extent, this is completely harmless.' - write(error_unit,'(a)') 'Still, we keep this info message just in case.' - end if - allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_single: error when allocating qtmp "//errorMessage - stop - endif - - dtmp = d(i+1) - qtmp(1:nlen) = q(1:nlen,i+1) - do j=i,1,-1 - if (dtmp=npc_0+npc_n) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Determine number of "next" and "prev" column for ring sends - - if (my_pcol == npc_0+npc_n-1) then - np_next = npc_0 - else - np_next = my_pcol + 1 - endif - - if (my_pcol == npc_0) then - np_prev = npc_0+npc_n-1 - else - np_prev = my_pcol - 1 - endif - - call check_monotony(nm,d,'Input1',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - call check_monotony(na-nm,d(nm+1),'Input2',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Get global number of processors and my processor number. - ! Please note that my_proc does not need to match any real processor number, - ! it is just used for load balancing some loops. 
- - n_procs = np_rows*npc_n - my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major - - - ! Local limits of the rows of Q - - l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q - l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm - l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q - - l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm - l_rows = l_rqe-l_rqs+1 ! Total number of local rows - - - ! My number of local columns - - l_cols = COUNT(p_col(1:na)==my_pcol) - - ! Get max number of local columns - - max_local_cols = 0 - do np = npc_0, npc_0+npc_n-1 - max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) - enddo - - ! Calculations start here - - beta = abs(e) - sig = sign(1.d0,e) - - ! Calculate rank-1 modifier z - - z(:) = 0 - - if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then - ! nm is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) - enddo - endif - - if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then - ! nm+1 is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) - enddo - endif - - call global_gather(z, na) - - ! Normalize z so that norm(z) = 1. Since z is the concatenation of - ! two normalized vectors, norm2(z) = sqrt(2). - - z = z/sqrt(2.0d0) - rho = 2.*beta - - ! Calculate index for merging both systems by ascending eigenvalues - - call DLAMRG( nm, na-nm, d, 1, 1, idx ) - - ! Calculate the allowable deflation tolerance - - zmax = maxval(abs(z)) - dmax = maxval(abs(d)) - EPS = DLAMCH( 'Epsilon' ) - TOL = 8.*EPS*MAX(dmax,zmax) - - ! If the rank-1 modifier is small enough, no more needs to be done - ! except to reorganize D and Q - - IF ( RHO*zmax <= TOL ) THEN - - ! Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - - ! Rearrange eigenvectors - - call resort_ev(idx, na) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - - return - ENDIF - - ! 
Merge and deflate system - - na1 = 0 - na2 = 0 - - ! COLTYP: - ! 1 : non-zero in the upper half only; - ! 2 : dense; - ! 3 : non-zero in the lower half only; - ! 4 : deflated. - - coltyp(1:nm) = 1 - coltyp(nm+1:na) = 3 - - do i=1,na - - if (rho*abs(z(idx(i))) <= tol) then - - ! Deflate due to small z component. - - na2 = na2+1 - d2(na2) = d(idx(i)) - idx2(na2) = idx(i) - coltyp(idx(i)) = 4 - - else if (na1>0) then - - ! Check if eigenvalues are close enough to allow deflation. - - S = Z(idx(i)) - C = Z1(na1) - - ! Find sqrt(a**2+b**2) without overflow or - ! destructive underflow. - - TAU = DLAPY2( C, S ) - T = D1(na1) - D(idx(i)) - C = C / TAU - S = -S / TAU - IF ( ABS( T*C*S ) <= TOL ) THEN - - ! Deflation is possible. - - na2 = na2+1 - - Z1(na1) = TAU - - d2new = D(idx(i))*C**2 + D1(na1)*S**2 - d1new = D(idx(i))*S**2 + D1(na1)*C**2 - - ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 - ! This means that after the above transformation it must be - ! D1(na1) <= d1new <= D(idx(i)) - ! D1(na1) <= d2new <= D(idx(i)) - ! - ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) - ! so there is no problem with sorting here. - ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) - ! which makes a check (and possibly a resort) necessary. - ! - ! The above relations may not hold exactly due to numeric differences - ! so they have to be enforced in order not to get troubles with sorting. - - - if (d1newD(idx(i))) d1new = D(idx(i)) - - if (d2newD(idx(i))) d2new = D(idx(i)) - - D1(na1) = d1new - - do j=na2-1,1,-1 - if (d2new2) then - - ! Solve secular equation - - z(1:na1) = 1 -#ifdef WITH_OPENMP - z_p(1:na1,:) = 1 -#endif - dbase(1:na1) = 0 - ddiff(1:na1) = 0 - - info = 0 -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) - my_thread = omp_get_thread_num() -!$OMP DO -#endif - DO i = my_proc+1, na1, n_procs ! 
work distributed over all processors - - call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! - - if (info/=0) then - ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) - ! use the more stable bisection algorithm in solve_secular_equation - ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' - call solve_secular_equation(na1, i, d1, z1, delta, rho, s) - endif - - ! Compute updated z - -#ifdef WITH_OPENMP - do j=1,na1 - if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) - enddo - z_p(i,my_thread) = z_p(i,my_thread)*delta(i) -#else - do j=1,na1 - if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) - enddo - z(i) = z(i)*delta(i) -#endif - ! store dbase/ddiff - - if (i1) then - - if (np_rem==npc_0) then - np_rem = npc_0+npc_n-1 - else - np_rem = np_rem-1 - endif -#ifdef WITH_MPI - call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, & - np_next, 1111, np_prev, 1111, & - mpi_comm_cols, mpi_status, mpierr) -#endif - endif - - ! Gather the parts in d1 and z which are fitting to qtmp1. - ! This also delivers nnzu/nnzl for proc np_rem - - nnzu = 0 - nnzl = 0 - do i=1,na1 - if (p_col(idx1(i))==np_rem) then - if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then - nnzu = nnzu+1 - d1u(nnzu) = d1(i) - zu (nnzu) = z (i) - endif - if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then - nnzl = nnzl+1 - d1l(nnzl) = d1(i) - zl (nnzl) = z (i) - endif - endif - enddo - - ! Set the deflated eigenvectors in Q (comming from proc np_rem) - - ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix - do i = 1, na - j = idx(i) - if (j>na1) then - if (p_col(idx2(j-na1))==np_rem) then - ndef = ndef+1 - if (p_col_out(i)==my_pcol) & - q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) - endif - endif - enddo - - do ns = 0, nqcols1-1, max_strip ! strimining loop - - ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip - - ! 
Get partial result from (output) Q - - do i = 1, ncnt - qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) - enddo - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with upper half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! - tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) - call v_add_s(tmp,nnzu,ddiff(j)) - ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) - enddo - - ! Multiply old Q with eigenvectors (upper half) - - if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & - call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1,ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(1,1),ubound(qtmp2,dim=1)) - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with lower half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! - tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) - call v_add_s(tmp,nnzl,ddiff(j)) - ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) - enddo - - ! Multiply old Q with eigenvectors (lower half) - - if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & - call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1(l_rnm+1,1),ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & - 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,dim=1)) - - ! Put partial result into (output) Q - - do i = 1, ncnt - q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) - enddo - - enddo - enddo - - deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"merge_systems: error when deallocating ev "//errorMessage - stop - endif - endif - -#ifdef WITH_OPENMP - deallocate(z_p, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"merge_systems: error when deallocating z_p "//errorMessage - stop - endif -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - - - return - - contains - subroutine add_tmp(d1, dbase, ddiff, z, ev_scale_value, na1,i) - use precision - implicit none - - integer(kind=ik), intent(in) :: na1, i - - real(kind=rk), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) - real(kind=rk), intent(inout) :: ev_scale_value - real(kind=rk) :: tmp(1:na1) - - ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code - ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results - - ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) - ! in exactly this order, but we want to prevent compiler optimization - - tmp(1:na1) = d1(1:na1) -dbase(i) - call v_add_s(tmp(1:na1),na1,ddiff(i)) - - tmp(1:na1) = z(1:na1) / tmp(1:na1) - - ev_scale_value = 1.0/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) - - end subroutine add_tmp - - subroutine resort_ev(idx_ev, nLength) - use precision - implicit none - - integer(kind=ik), intent(in) :: nLength - integer(kind=ik) :: idx_ev(nLength) - integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out - - real(kind=rk), allocatable :: qtmp(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage - - if (l_rows==0) return ! My processor column has no work to do - - ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) - - l_cols_out = COUNT(p_col_out(1:na)==my_pcol) - allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"resort_ev: error when allocating qtmp "//errorMessage - stop - endif - - nc = 0 - - do i=1,na - - pc1 = p_col(idx_ev(i)) - lc1 = l_col(idx_ev(i)) - pc2 = p_col_out(i) - - if (pc2<0) cycle ! This column is not needed in output - - if (pc2==my_pcol) nc = nc+1 ! Counter for output columns - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! 
send and recieve column are local - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) - else -#ifdef WITH_MPI - call mpi_send(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,mod(i,4096),mpi_comm_cols,mpierr) -#endif - endif - else if (pc2==my_pcol) then -#ifdef WITH_MPI - call mpi_recv(qtmp(1,nc),l_rows,MPI_REAL8,pc1,mod(i,4096),mpi_comm_cols,mpi_status,mpierr) -#else - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,nc) -#endif - endif - enddo - - ! Insert qtmp into (output) q - - nc = 0 - - do i=1,na - - pc2 = p_col_out(i) - lc2 = l_col_out(i) - - if (pc2==my_pcol) then - nc = nc+1 - q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) - endif - enddo - - deallocate(qtmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"resort_ev: error when deallocating qtmp "//errorMessage - stop - endif - end subroutine resort_ev - - subroutine transform_columns(col1, col2) - use precision - implicit none - - integer(kind=ik) :: col1, col2 - integer(kind=ik) :: pc1, pc2, lc1, lc2 - - if (l_rows==0) return ! My processor column has no work to do - - pc1 = p_col(col1) - lc1 = l_col(col1) - pc2 = p_col(col2) - lc2 = l_col(col2) - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! 
both columns are local - tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) - q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) - else -#ifdef WITH_MPI - call mpi_sendrecv(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,1, & - tmp,l_rows,MPI_REAL8,pc2,1, & - mpi_comm_cols,mpi_status,mpierr) -#else - tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) -#endif - q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) - endif - else if (pc2==my_pcol) then -#ifdef WITH_MPI - call mpi_sendrecv(q(l_rqs,lc2),l_rows,MPI_REAL8,pc1,1, & - tmp,l_rows,MPI_REAL8,pc1,1, & - mpi_comm_cols,mpi_status,mpierr) -#else - tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) -#endif - q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - endif - - end subroutine transform_columns - - subroutine global_gather(z, n) - - ! This routine sums up z over all processors. - ! It should only be used for gathering distributed results, - ! i.e. z(i) should be nonzero on exactly 1 processor column, - ! otherways the results may be numerically different on different columns - use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: z(n) - real(kind=rk) :: tmp(n) - - if (npc_n==1 .and. np_rows==1) return ! nothing to do - - ! Do an mpi_allreduce over processor rows -#ifdef WITH_MPI - call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - tmp = z -#endif - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif - - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then -#ifdef WITH_MPI - call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) -#else - tmp = z -#endif - return - endif - - ! 
Do a ring send over processor columns - z(:) = 0 - do np = 1, npc_n - z(:) = z(:) + tmp(:) -#ifdef WITH_MPI - call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, & - mpi_comm_cols, mpi_status, mpierr) -#endif - enddo - - end subroutine global_gather - - subroutine global_product(z, n) - - ! This routine calculates the global product of z. - use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: z(n) - - real(kind=rk) :: tmp(n) - - if (npc_n==1 .and. np_rows==1) return ! nothing to do - - ! Do an mpi_allreduce over processor rows -#ifdef WITH_MPI - call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr) -#else - tmp = z -#endif - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif - - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then -#ifdef WITH_MPI - call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr) -#else - z = tmp -#endif - return - endif - - ! We send all vectors to the first proc, do the product there - ! and redistribute the result. - - if (my_pcol == npc_0) then - z(1:n) = tmp(1:n) - do np = npc_0+1, npc_0+npc_n-1 -#ifdef WITH_MPI - call mpi_recv(tmp,n,MPI_REAL8,np,1111,mpi_comm_cols,mpi_status,mpierr) -#else - tmp(1:n) = z(1:n) -#endif - z(1:n) = z(1:n)*tmp(1:n) - enddo - do np = npc_0+1, npc_0+npc_n-1 -#ifdef WITH_MPI - call mpi_send(z,n,MPI_REAL8,np,1111,mpi_comm_cols,mpierr) -#endif - enddo - else -#ifdef WITH_MPI - call mpi_send(tmp,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpierr) - call mpi_recv(z ,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpi_status,mpierr) -#else - z(1:n) = tmp(1:n) -#endif - endif - - end subroutine global_product - - subroutine check_monotony(n,d,text, wantDebug, success) - - ! This is a test routine for checking if the eigenvalues are monotonically increasing. - ! It is for debug purposes only, an error should never be triggered! 
- use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: d(n) - character*(*) :: text - - integer(kind=ik) :: i - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - do i=1,n-1 - if (d(i+1) 0 and d(i+1) > d(i) - ! - ! but this routine will not terminate with error if these are not satisfied - ! (it will normally converge to a pole in this case). - ! - ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases - ! N=1 and N=2 which is not compatible with DLAED4. - ! Thus this routine shouldn't be used for these cases as a simple replacement - ! of DLAED4. - ! - ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): - ! - ! - ! N (input) INTEGER - ! The length of all arrays. - ! - ! I (input) INTEGER - ! The index of the eigenvalue to be computed. 1 <= I <= N. - ! - ! D (input) DOUBLE PRECISION array, dimension (N) - ! The original eigenvalues. It is assumed that they are in - ! order, D(I) < D(J) for I < J. - ! - ! Z (input) DOUBLE PRECISION array, dimension (N) - ! The components of the updating vector. - ! - ! DELTA (output) DOUBLE PRECISION array, dimension (N) - ! DELTA contains (D(j) - lambda_I) in its j-th component. - ! See remark above about DLAED4 compatibility! - ! - ! RHO (input) DOUBLE PRECISION - ! The scalar in the symmetric updating formula. - ! - ! DLAM (output) DOUBLE PRECISION - ! The computed lambda_I, the I-th updated eigenvalue. - !------------------------------------------------------------------------------- - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: n, i - real(kind=rk) :: d(n), z(n), delta(n), rho, dlam - - integer(kind=ik) :: iter - real(kind=rk) :: a, b, x, y, dshift - - ! In order to obtain sufficient numerical accuracy we have to shift the problem - ! either by d(i) or d(i+1), whichever is closer to the solution - - ! 
Upper and lower bound of the shifted solution interval are a and b - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_secular_equation") -#endif - if (i==n) then - - ! Special case: Last eigenvalue - ! We shift always by d(n), lower bound is d(n), - ! upper bound is determined by a guess: - - dshift = d(n) - delta(:) = d(:) - dshift - - a = 0. ! delta(n) - b = rho*SUM(z(:)**2) + 1. ! rho*SUM(z(:)**2) is the lower bound for the guess - - else - - ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) - ! We check the sign of the function in the midpoint of the interval - ! in order to determine if eigenvalue is more close to d(i) or d(i+1) - - x = 0.5*(d(i)+d(i+1)) - y = 1. + rho*SUM(z(:)**2/(d(:)-x)) - - if (y>0) then - ! solution is next to d(i) - dshift = d(i) - else - ! solution is next to d(i+1) - dshift = d(i+1) - endif - - delta(:) = d(:) - dshift - a = delta(i) - b = delta(i+1) - - endif - - ! Bisection: - - do iter=1,200 - - ! Interval subdivision - - x = 0.5*(a+b) - - if (x==a .or. x==b) exit ! No further interval subdivisions possible - if (abs(x) < 1.d-200) exit ! x next to pole - - ! evaluate value at x - - y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) - - if (y==0) then - ! found exact solution - exit - elseif (y>0) then - b = x - else - a = x - endif - - enddo - - ! Solution: - - dlam = x + dshift - delta(:) = delta(:) - x -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_secular_equation") -#endif - - end subroutine solve_secular_equation - - !------------------------------------------------------------------------------- - - integer function local_index(idx, my_proc, num_procs, nblk, iflag) - - !------------------------------------------------------------------------------- - ! local_index: returns the local index for a given global index - ! If the global index has no local index on the - ! processor my_proc behaviour is defined by iflag - ! - ! Parameters - ! - ! idx Global index - ! - ! 
my_proc Processor row/column for which to calculate the local index - ! - ! num_procs Total number of processors along row/column - ! - ! nblk Blocksize - ! - ! iflag Controls the behaviour if idx is not on local processor - ! iflag< 0 : Return last local index before that row/col - ! iflag==0 : Return 0 - ! iflag> 0 : Return next local index after that row/col - !------------------------------------------------------------------------------- - use precision - implicit none - - integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag - - integer(kind=ik) :: iblk - - iblk = (idx-1)/nblk ! global block number, 0 based - - if (mod(iblk,num_procs) == my_proc) then - - ! block is local, always return local row/col number - - local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1 - - else - - ! non local block - - if (iflag == 0) then - - local_index = 0 - - else - - local_index = (iblk/num_procs)*nblk - - if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk - - if (iflag>0) local_index = local_index + 1 - endif - endif - - end function local_index - - subroutine cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! cholesky_real: Cholesky factorization of a real symmetric matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols) - ! was - ! real a(lda, *) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"cholesky_real: error when allocating tmp2 "//errorMessage - stop - endif - - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmatr "//errorMessage - stop - endif - - allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmatc "//errorMessage - stop - endif - - tmatr = 0 - tmatc = 0 - - do n = 1, na, nblk - - ! Calculate first local row and column of the still remaining matrix - ! on the local processor - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) - - if (n+nblk > na) then - - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block - - if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then - - call dpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. - return - endif - - endif - - exit ! Loop - - endif - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block - - call dpotrf('U',nblk,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. 
- return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif -#ifdef WITH_MPI - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call dtrsm('L','U','T','N',nblk,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - endif - - do i=1,nblk - - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) -#ifdef WITH_MPI - if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) -#endif - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_real (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) & - call DTRMM('L','U','N','N',nb,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 - - endif - - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif - - do i=1,nb -#ifdef WITH_MPI - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - enddo - endif -#ifdef WITH_MPI - if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) -#endif - if (l_row1>1 .and. 
l_cols-l_col1+1>0) & - call dgemm('N','N',l_row1-1,l_cols-l_col1+1,nb, -1.d0, & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - 1.d0, a(1,l_col1),lda) - - enddo - - deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"invert_trm_real: error when deallocating tmp1 "//errorMessage - stop - endif - - end subroutine invert_trm_real - - subroutine cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! cholesky_complex: Cholesky factorization of a complex hermitian matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matriCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_complex") -#endif - success = .true. - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"cholesky_complex: error when allocating tmp2 "//errorMessage - stop - endif - - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmatr "//errorMessage - stop - endif - - allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmatc "//errorMessage - stop - endif - - tmatr = 0 - tmatc = 0 - - do n = 1, na, nblk - - ! Calculate first local row and column of the still remaining matrix - ! on the local processor - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) - - if (n+nblk > na) then - - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block - - if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then - - call zpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. - return - endif - - endif - - exit ! Loop - endif - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block - - call zpotrf('U',nblk,a(l_row1,l_col1),lda,info) - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. 
- return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif -#ifdef WITH_MPI - call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & - call ztrsm('L','U','C','N',nblk,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - endif - - do i=1,nblk - - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) -#ifdef WITH_MPI - if (l_cols-l_colx+1>0) & - call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) -#endif - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_complex (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) & - call ZTRMM('L','U','N','N',nb,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) - - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! 
tmp2 has the lower left triangle 0 - - endif - - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif - - do i=1,nb -#ifdef WITH_MPI - call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - enddo - endif -#ifdef WITH_MPI - if (l_cols-l_col1+1>0) & - call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) -#endif - if (l_row1>1 .and. l_cols-l_col1+1>0) & - call ZGEMM('N','N',l_row1-1,l_cols-l_col1+1,nb, (-1.d0,0.d0), & - tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & - (1.d0,0.d0), a(1,l_col1),lda) - - enddo - - deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"invert_trm_complex: error when deallocating tmp1 "//errorMessage - stop - endif - end subroutine invert_trm_complex - - integer function least_common_multiple(a, b) - - ! Returns the least common multiple of a and b - ! There may be more efficient ways to do this, we use the most simple approach - use precision - implicit none - integer(kind=ik), intent(in) :: a, b - - do least_common_multiple = a, a*(b-1), a - if(mod(least_common_multiple,b)==0) exit - enddo - ! if the loop is left regularly, least_common_multiple = a*b - - end function - - subroutine hh_transform_real(alpha, xnorm_sq, xf, tau) - ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. - use precision - implicit none - real(kind=rk), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - real(kind=rk), intent(out) :: xf, tau - - real(kind=rk) :: BETA - - if ( XNORM_SQ==0. ) then - - if ( ALPHA>=0. ) then - TAU = 0. 
- else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. - - else - - BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHA = XNORM_SQ / ALPHA - TAU = ALPHA / BETA - ALPHA = -ALPHA - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - - end subroutine - - subroutine hh_transform_complex(alpha, xnorm_sq, xf, tau) - - ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. - use precision - implicit none - complex(kind=ck), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - complex(kind=ck), intent(out) :: xf, tau - - real*8 ALPHR, ALPHI, BETA - - ALPHR = DBLE( ALPHA ) - ALPHI = DIMAG( ALPHA ) - - if ( XNORM_SQ==0. .AND. ALPHI==0. ) then - - if ( ALPHR>=0. ) then - TAU = 0. - else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. - - else - - BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHR = ALPHI * (ALPHI/DBLE( ALPHA )) - ALPHR = ALPHR + XNORM_SQ/DBLE( ALPHA ) - TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) - ALPHA = DCMPLX( -ALPHR, ALPHI ) - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - - end subroutine - -end module ELPA1_compute diff -Nru elpa-2016.05.001/src/elpa1.F90 elpa-2019.11.001/src/elpa1.F90 --- elpa-2016.05.001/src/elpa1.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa1.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,478 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! 
- Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
- -!> \mainpage -!> Eigenvalue SoLvers for Petaflop-Applications (ELPA) -!> \par -!> http://elpa.mpcdf.mpg.de -!> -!> \par -!> The ELPA library was originally created by the ELPA consortium, -!> consisting of the following organizations: -!> -!> - Max Planck Computing and Data Facility (MPCDF) formerly known as -!> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -!> - Bergische Universität Wuppertal, Lehrstuhl für angewandte -!> Informatik, -!> - Technische Universität München, Lehrstuhl für Informatik mit -!> Schwerpunkt Wissenschaftliches Rechnen , -!> - Fritz-Haber-Institut, Berlin, Abt. Theorie, -!> - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -!> Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -!> and -!> - IBM Deutschland GmbH -!> -!> Some parts and enhancements of ELPA have been contributed and authored -!> by the Intel Corporation which is not part of the ELPA consortium. -!> -!> Contributions to the ELPA source have been authored by (in alphabetical order): -!> -!> \author T. Auckenthaler, Volker Blum, A. Heinecke, L. Huedepohl, R. Johanni, Werner Jürgens, and A. Marek - - -#include "config-f90.h" -!> \brief Fortran module which provides the routines to use the one-stage ELPA solver -module ELPA1 - use precision - use elpa_utilities - use elpa1_compute - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - use elpa_mpi - implicit none - - PRIVATE ! By default, all routines contained are private - - ! 
The following routines are public: - - public :: get_elpa_row_col_comms !< old, deprecated interface: Sets MPI row/col communicators - public :: get_elpa_communicators !< Sets MPI row/col communicators - - public :: solve_evp_real !< old, deprecated interface: Driver routine for real eigenvalue problem - public :: solve_evp_real_1stage !< Driver routine for real eigenvalue problem - public :: solve_evp_complex !< old, deprecated interface: Driver routine for complex eigenvalue problem - public :: solve_evp_complex_1stage !< Driver routine for complex eigenvalue problem - - ! Timing results, set by every call to solve_evp_xxx - - real(kind=rk), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) - real(kind=rk), public :: time_evp_solve !< time for solving the tridiagonal system - real(kind=rk), public :: time_evp_back !< time for back transformations of eigenvectors - - logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs - - -!> \brief get_elpa_row_col_comms: old, deprecated Fortran function to create the MPI communicators for ELPA. Better use "elpa_get_communicators" -!> \detail -!> The interface and variable definition is the same as in "elpa_get_communicators" -!> \param mpi_comm_global Global communicator for the calculations (in) -!> -!> \param my_prow Row coordinate of the calling process in the process grid (in) -!> -!> \param my_pcol Column coordinate of the calling process in the process grid (in) -!> -!> \param mpi_comm_rows Communicator for communicating within rows of processes (out) -!> -!> \param mpi_comm_cols Communicator for communicating within columns of processes (out) -!> \result mpierr integer error value of mpi_comm_split function - interface get_elpa_row_col_comms - module procedure get_elpa_communicators - end interface - -!> \brief solve_evp_real: old, deprecated Fortran function to solve the real eigenvalue problem with 1-stage solver. 
Better use "solve_evp_real_1stage" -!> -!> \detail -!> The interface and variable definition is the same as in "elpa_solve_evp_real_1stage" -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - - interface solve_evp_real - module procedure solve_evp_real_1stage - end interface - -!> \brief solve_evp_complex: old, deprecated Fortran function to solve the complex eigenvalue problem with 1-stage solver. Better use "solve_evp_complex_1stage" -!> -!> \detail -!> The interface and variable definition is the same as in "elpa_solve_evp_complex_1stage" -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). 
-!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - - interface solve_evp_complex - module procedure solve_evp_complex_1stage - end interface - -contains - -!------------------------------------------------------------------------------- - -!> \brief Fortran function to create the MPI communicators for ELPA. -! All ELPA routines need MPI communicators for communicating within -! rows or columns of processes, these are set here. -! mpi_comm_rows/mpi_comm_cols can be free'd with MPI_Comm_free if not used any more. -! -! Parameters -! 
-!> \param mpi_comm_global Global communicator for the calculations (in) -!> -!> \param my_prow Row coordinate of the calling process in the process grid (in) -!> -!> \param my_pcol Column coordinate of the calling process in the process grid (in) -!> -!> \param mpi_comm_rows Communicator for communicating within rows of processes (out) -!> -!> \param mpi_comm_cols Communicator for communicating within columns of processes (out) -!> \result mpierr integer error value of mpi_comm_split function - - -function get_elpa_communicators(mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) result(mpierr) - use precision - implicit none - - integer(kind=ik), intent(in) :: mpi_comm_global, my_prow, my_pcol - integer(kind=ik), intent(out) :: mpi_comm_rows, mpi_comm_cols - - integer(kind=ik) :: mpierr - - ! mpi_comm_rows is used for communicating WITHIN rows, i.e. all processes - ! having the same column coordinate share one mpi_comm_rows. - ! So the "color" for splitting is my_pcol and the "key" is my row coordinate. - ! Analogous for mpi_comm_cols - - call mpi_comm_split(mpi_comm_global,my_pcol,my_prow,mpi_comm_rows,mpierr) - call mpi_comm_split(mpi_comm_global,my_prow,my_pcol,mpi_comm_cols,mpierr) - -end function get_elpa_communicators - - -!> \brief solve_evp_real_1stage: Fortran function to solve the real eigenvalue problem with 1-stage solver -!> -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). 
-!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! -!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - - -function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) - ! was - ! real a(lda,*), q(ldq,*) - - integer(kind=ik) :: my_prow, my_pcol, mpierr - real(kind=rk), allocatable :: e(:), tau(:) - real(kind=rk) :: ttt0, ttt1 - logical :: success - logical, save :: firstCall = .true. - logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_1stage") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - - success = .true. - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - allocate(e(na), tau(na)) - - ttt0 = MPI_Wtime() - call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) - - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0 - time_evp_fwd = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0 - time_evp_back = ttt1-ttt0 - - deallocate(e, tau) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_1stage") -#endif - -end function solve_evp_real_1stage - - -!> \brief solve_evp_complex_1stage: Fortran function to solve the complex eigenvalue problem with 1-stage solver -!> -! Parameters -! -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed. -!> The smallest nev eigenvalues/eigenvectors are calculated. -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
-!> -!> \param matrixCols distributed number of matrix columns -!> -!> \param mpi_comm_rows MPI-Communicator for rows -!> \param mpi_comm_cols MPI-Communicator for columns -!> -!> \result success - -function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) - ! was - ! complex a(lda,*), q(ldq,*) - real(kind=rk) :: ev(na) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_rows, l_cols, l_cols_nev - real(kind=rk), allocatable :: q_real(:,:), e(:) - complex(kind=ck), allocatable :: tau(:) - real(kind=rk) :: ttt0, ttt1 - - logical :: success - logical, save :: firstCall = .true. - logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_complex_1stage") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - success = .true. - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev - - allocate(e(na), tau(na)) - allocate(q_real(l_rows,l_cols)) - - ttt0 = MPI_Wtime() - call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0 - time_evp_fwd = ttt1-ttt0 - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - - ttt0 = MPI_Wtime() - q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) - - call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0 - time_evp_back = ttt1-ttt0 - - deallocate(q_real) - deallocate(e, tau) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_1stage") -#endif - -end function solve_evp_complex_1stage - - - -end module ELPA1 diff -Nru elpa-2016.05.001/src/elpa2/compute_hh_trafo.F90 elpa-2019.11.001/src/elpa2/compute_hh_trafo.F90 --- elpa-2016.05.001/src/elpa2/compute_hh_trafo.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/compute_hh_trafo.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,2628 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! 
More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF +#endif + + subroutine compute_hh_trafo_& + &MATH_DATATYPE& +#ifdef WITH_OPENMP + &_openmp_& +#else + &_& +#endif + &PRECISION & + (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, & +#ifdef WITH_OPENMP + l_nev, & +#endif + a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, & +#ifdef WITH_OPENMP + my_thread, thread_width, & +#else + last_stripe_width, & +#endif + kernel) + + use precision + use elpa_abstract_impl + use iso_c_binding +#if REALCASE == 1 + + use single_hh_trafo_real +#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE)) + use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple +#endif + +#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE)) + use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple +#endif + +!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE)) +! 
use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple +!#endif + +#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE)) + use real_generic_kernel !, only : double_hh_trafo_generic +#endif + +#if defined(WITH_REAL_BGP_KERNEL) + use real_bgp_kernel !, only : double_hh_trafo_bgp +#endif + +#if defined(WITH_REAL_BGQ_KERNEL) + use real_bgq_kernel !, only : double_hh_trafo_bgq +#endif + +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + +#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE)) + use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple +#endif +#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE)) + use complex_generic_kernel !, only : single_hh_trafo_complex_generic +#endif + +#endif /* COMPLEXCASE */ + + use cuda_c_kernel + use cuda_functions + + use elpa_generated_fortran_interfaces + + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: useGPU, wantDebug + real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double + integer(kind=lik) :: kernel_flops + integer(kind=ik), intent(in) :: nbw, max_blk_size +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size) +#endif + integer(kind=ik), intent(in) :: a_off + + integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count + + integer(kind=ik), intent(in) :: max_threads +#ifndef WITH_OPENMP + integer(kind=ik), intent(in) :: last_stripe_width +#if REALCASE == 1 +! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) + real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:) +#endif +#if COMPLEXCASE == 1 +! 
complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) + complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:) +#endif + +#else /* WITH_OPENMP */ + integer(kind=ik), intent(in) :: l_nev, thread_width +#if REALCASE == 1 +! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) + real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:) + +#endif +#if COMPLEXCASE == 1 +! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) + complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:) +#endif + +#endif /* WITH_OPENMP */ + + integer(kind=ik), intent(in) :: kernel + + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t) :: bcast_buffer_dev +#if REALCASE == 1 + integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case +#endif + integer(kind=c_intptr_t) :: hh_tau_dev + integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2 + + ! Private variables in OMP regions (my_thread) should better be in the argument list! + integer(kind=ik) :: off, ncols, istripe +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread, noff +#endif + integer(kind=ik) :: j, nl, jj, jjj, n_times +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: w(nbw,6) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: w(nbw,2) +#endif + real(kind=c_double) :: ttt ! MPI_WTIME always needs double + + j = -99 + + if (wantDebug) then + if (useGPU .and. & +#if REALCASE == 1 + ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then +#endif +#if COMPLEXCASE == 1 + ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then +#endif + print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!" + stop + endif + endif + +#if REALCASE == 1 + if (kernel .eq. ELPA_2STAGE_REAL_GPU) then +#endif +#if COMPLEXCASE == 1 + if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then +#endif + ! 
ncols - indicates the number of HH reflectors to apply; at least 1 must be available + if (ncols < 1) then + if (wantDebug) then + print *, "Returning early from compute_hh_trafo" + endif + return + endif + endif + + if (wantDebug) call obj%timer%start("compute_hh_trafo_& + &MATH_DATATYPE& +#ifdef WITH_OPENMP + &_openmp" // & +#else + &" // & +#endif + &PRECISION_SUFFIX & + ) + + +#ifdef WITH_OPENMP + if (my_thread==1) then +#endif + ttt = mpi_wtime() +#ifdef WITH_OPENMP + endif +#endif + +#ifdef WITH_OPENMP + +#if REALCASE == 1 + if (kernel .eq. ELPA_2STAGE_REAL_GPU) then + print *,"compute_hh_trafo_& + &MATH_DATATYPE& + &_GPU OPENMP: not yet implemented" + stop 1 + endif +#endif +#if COMPLEXCASE == 1 + if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then + print *,"compute_hh_trafo_& + &MATH_DATATYPE& + &_GPU OPENMP: not yet implemented" + stop 1 + endif +#endif +#endif /* WITH_OPENMP */ + +#ifndef WITH_OPENMP + nl = merge(stripe_width, last_stripe_width, istripe +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + + + +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! 
with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + subroutine bandred_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, a_mat, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, & + tmat_dev, wantDebug, useGPU, success, & +#if REALCASE == 1 + useQR, & +#endif + max_threads) + + !------------------------------------------------------------------------------- + ! bandred_real/complex: Reduces a distributed symmetric matrix to band form + ! + ! Parameters + ! + ! na Order of matrix + ! + ! a_mat(lda,matrixCols) Distributed matrix which should be reduced. + ! Distribution is like in Scalapack. + ! Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half) + ! a_mat(:,:) is overwritten on exit with the band and the Householder vectors + ! in the upper half. + ! + ! lda Leading dimension of a_mat + ! matrixCols local columns of matrix a_mat + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! nbw semi bandwith of output matrix + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! + ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 + ! Factors for the Householder vectors (returned), needed for back transformation + ! 
+ !------------------------------------------------------------------------------- + + use cuda_functions + use iso_c_binding + use elpa1_compute +#ifdef WITH_OPENMP + use omp_lib +#endif + use precision + use elpa_blas_interfaces +#ifdef WITH_MPI + use elpa_scalapack_interfaces +#endif + use elpa_abstract_impl + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a_mat(lda,*) + MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*) +#else + MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols) + MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks) +#endif + +#if REALCASE == 1 + real(kind=rk) :: eps +#endif + logical, intent(in) :: useGPU + integer(kind=c_int) :: skewsymmetric + logical :: isSkewsymmetric + character(20) :: gpuString + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: l_cols, l_rows +#if REALCASE == 1 + integer(kind=ik) :: vmrCols +#endif +#ifdef WITH_OPENMP + integer(kind=ik) :: mynlc, lrs, transformChunkSize +#endif + integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow + integer(kind=ik) :: istep, ncol, lch, lcx, nlc + integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + + real(kind=rk) :: vnorm2 + MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau + MATH_DATATYPE(kind=rck) :: vav(nbw,nbw) + +! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case + MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:) + MATH_DATATYPE(kind=rck), allocatable :: vmrCUDA(:), umcCUDA(:) + MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:) + MATH_DATATYPE(kind=rck), allocatable :: vr(:) + +#if REALCASE == 1 + ! 
needed for blocked QR decomposition + integer(kind=ik) :: PQRPARAM(11), work_size + real(kind=rk) :: dwork_size(1) + real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) +#endif + ! a_dev is passed from bandred_real to trans_ev_band + integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev + integer(kind=ik) :: ierr + integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size + integer(kind=c_intptr_t) :: lc_start, lc_end +#if COMPLEXCASE == 1 + integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1 +#endif + integer(kind=ik) :: lr_end + integer(kind=ik) :: na_cols + integer(kind=BLAS_KIND) :: na_colsBLAS +#if COMPLEXCASE == 1 + integer(kind=ik) :: na_rows + integer(kind=BLAS_KIND) :: na_rowsBLAS +#endif + + logical, intent(in) :: wantDebug + logical, intent(out) :: success + logical :: successCUDA + integer(kind=ik) :: istat + character(200) :: errorMessage + integer(kind=ik) :: min_tile_size, error + +#if REALCASE == 1 + logical, intent(in) :: useQR +#endif + integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & + ii, pp + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + + logical :: useGPU_reduction_lower_block_to_tridiagonal + integer(kind=ik), intent(in) :: max_threads + + call obj%get("is_skewsymmetric",skewsymmetric,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + isSkewsymmetric = (skewsymmetric == 1) + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("bandred_& + &MATH_DATATYPE& + &" // & + PRECISION_SUFFIX // & + gpuString ) + + useGPU_reduction_lower_block_to_tridiagonal = .false. + + if (useGPU) then + useGPU_reduction_lower_block_to_tridiagonal = .true. +#if REALCASE == 1 + if (useQR) then + !in this case switch off GPU usage for step "reduce current block to lower triangular form" + ! 
since this is done by QR decomposition + useGPU_reduction_lower_block_to_tridiagonal = .false. + endif +#endif + endif + + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + if (wantDebug) call obj%timer%stop("mpi_communication") + success = .true. + + + ! Semibandwith nbw must be a multiple of blocksize nblk + if (mod(nbw,nblk)/=0) then + if (my_prow==0 .and. my_pcol==0) then + if (wantDebug) then + write(error_unit,*) 'ELPA2_bandred_& + &MATH_DATATYPE& + &: ERROR: nbw=',nbw,', nblk=',nblk + write(error_unit,*) 'ELPA2_bandred_& + &MATH_DATATYPE& + &: ELPA2 works only for nbw==n*nblk' + endif + success = .false. + return + endif + endif + + ! na_rows in used nowhere; only na_cols + if (useGPU) then +#ifdef WITH_MPI +#if COMPLEXCASE == 1 + na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & + int(my_prow,kind=BLAS_KIND), 0_BLAS_KIND, int(np_rows,kind=BLAS_KIND)) + na_rows = int(na_rowsBLAS,kind=c_int) +#endif + na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), & + int(my_pcol,kind=BLAS_KIND), 0_BLAS_KIND, int(np_cols,kind=BLAS_KIND)) + na_cols = int(na_colsBLAS,kind=c_int) +#else +#if COMPLEXCASE == 1 + na_rows = na +#endif + na_cols = na +#endif /* WITH_MPI */ + + ! 
Here we convert the regular host array into a pinned host array + successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMalloc a_dev 1" + stop 1 + endif + + successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMalloc vav_dev 1" + stop 1 + endif + endif ! useGPU + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + + ! make tile_size a smallest possible multiple of previously defined tile size, such that it is + ! larger or equal to min_tile_size + ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value + ! it can, however, be set by the user + call obj%get("min_tile_size", min_tile_size ,error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if(min_tile_size == 0) then + ! not set by the user, use the default value + min_tile_size = 128*max(np_rows, np_cols) + endif + tile_size = ((min_tile_size-1)/tile_size+1)*tile_size + + l_rows_tile = tile_size/np_rows ! local rows of a tile + l_cols_tile = tile_size/np_cols ! local cols of a tile + +#if REALCASE == 1 + if (useQR) then + + if (which_qr_decomposition == 1) then + call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') + allocate(tauvector(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_real: error when allocating tauvector "//errorMessage + stop 1 + endif + + allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"bandred_real: error when allocating blockheuristic "//errorMessage + stop 1 + endif + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) + allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_real: error when allocating vmrCPU "//errorMessage + stop 1 + endif + + vmrCols = na + +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & + nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & + mpi_comm_rows, mpi_comm_cols, blockheuristic) + +#else + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & + vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & + nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & + mpi_comm_rows, mpi_comm_cols, blockheuristic) +#endif + + work_size = int(dwork_size(1)) + allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_real: error when allocating work_blocked "//errorMessage + stop 1 + endif + work_blocked = 0.0_rk + deallocate(vmrCPU, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_real: error when deallocating vmrCPU "//errorMessage + stop 1 + endif + + endif ! which_qr_decomposition + + endif ! useQr +#endif /* REALCASE */ + + if (useGPU) then + + cur_l_rows = 0 + cur_l_cols = 0 + + successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), & + (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy a_dev 2" + stop 1 + endif + endif ! useGPU + + + do istep = (na-1)/nbw, 1, -1 + + n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step + + ! 
Number of local columns/rows of remaining matrix + l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) + l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) + + ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces + + if (useGPU) then + cur_l_rows = max(l_rows, 1) + cur_l_cols = max(l_cols, 1) + + vmr_size = cur_l_rows * 2 * n_cols + umc_size = cur_l_cols * 2 * n_cols + + ! Allocate vmr and umc only if the inew size exceeds their current capacity + ! Added for FORTRAN CALLS + if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then + if (allocated(vr)) then + deallocate(vr, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when deallocating vr "//errorMessage + stop 1 + endif + endif + allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vr "//errorMessage + stop 1 + endif + + endif + + if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then + if (allocated(vmrCUDA)) then + deallocate(vmrCUDA, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vmrCUDA "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(vmr_dev) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE&: error in cuda_free vmr_dev 1" + stop 1 + endif + endif + + allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage) + + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vmrCUDA "//errorMessage + stop 1 + endif + successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMalloc: vmr_dev2" + stop 1 + endif + + endif + + if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. 
ubound(umcCUDA, dim=1))) then + if (allocated(umcCUDA)) then + deallocate(umcCUDA, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when deallocating umcCUDA "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(umc_dev) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaFree umc_dev 1" + stop 1 + endif + + endif + + allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage) + + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when deallocating umcCUDA "//errorMessage + stop 1 + endif + + successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMalloc umc_dev 2" + stop 1 + endif + + endif + + else ! GPU not used + + ! unify the the name vmr and vmrCPU, as well as vmrGPU + ! the same for umcCPU and umcGPU + ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces + + allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vmrCPU "//errorMessage + stop 1 + endif + + allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating umcCPU "//errorMessage + stop 1 + endif + + allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating vr "//errorMessage + stop 1 + endif + + endif ! use GPU + + if (useGPU) then + vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck + else + vmrCPU(1:l_rows,1:n_cols) = 0.0_rck + endif ! 
useGPU + + vr(:) = 0.0_rck + tmat(:,:,istep) = 0.0_rck + if (useGPU) then +#if REALCASE == 1 + umcCUDA(1 : umc_size) = 0.0_rck +#endif + lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) + lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) + lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) + + if (lc_start .le. 0) lc_start = 1 + + ! Here we assume that the processor grid and the block grid are aligned + cur_pcol = pcol(istep*nbw+1, nblk, np_cols) + + if (my_pcol == cur_pcol) then + successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), & + int((lda*size_of_datatype),kind=c_intptr_t), & + (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), & + int(lda*size_of_datatype,kind=c_intptr_t), & + int(lr_end*size_of_datatype,kind=c_intptr_t), & + int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int)) + + + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy2d" + stop 1 + endif + endif + endif ! useGPU + + ! Reduce current block to lower triangular form +#if REALCASE == 1 + if (useQR) then + if (useGPU) then + ! 
vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols) + endif + + if (which_qr_decomposition == 1) then + vmrCols = 2*n_cols +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), & + na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & + work_size, na, n_cols, nblk, nblk, & + istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& + 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& + blockheuristic) + +#else + call qr_pdgeqrf_2dcomm_& + &PRECISION& + &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , & + max(l_rows,1), vmrCols, tauvector(1:na), na, & + tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & + work_size, na, n_cols, nblk, nblk, & + istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& + 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& + blockheuristic) +#endif + endif + + else !useQR +#endif /* REALCASE == 1 */ + do lc = n_cols, 1, -1 + + ncol = istep*nbw + lc ! absolute column number of householder Vector + nrow = ncol - nbw ! Absolute number of pivot row + + lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length + lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number + + tau = 0 + + if (nrow == 1) exit ! Nothing to do + + cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block + + if (my_pcol==cur_pcol) then + + ! Get Vector to be transformed; distribute last element and norm of + ! remaining elements to all procs in current column + + vr(1:lr) = a_mat(1:lr,lch) ! 
Vector to be transformed + + if (my_prow==prow(nrow, nblk, np_rows)) then + aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) + aux1(2) = vr(lr) + else + aux1(1) = dot_product(vr(1:lr),vr(1:lr)) + aux1(2) = 0.0_rck + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + aux2 = aux1 ! this should be optimized +#endif + +#if REALCASE == 1 + vnorm2 = aux2(1) +#endif +#if COMPLEXCASE == 1 + vnorm2 = real(aux2(1),kind=rk) +#endif + vrl = aux2(2) + + ! Householder transformation + call hh_transform_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vrl, vnorm2, xf, tau, wantDebug) + ! Scale vr and store Householder Vector for back transformation + + vr(1:lr) = vr(1:lr) * xf + if (my_prow==prow(nrow, nblk, np_rows)) then + a_mat(1:lr-1,lch) = vr(1:lr-1) + a_mat(lr,lch) = vrl + vr(lr) = 1.0_rck + else + a_mat(1:lr,lch) = vr(1:lr) + endif + + endif + + ! Broadcast Householder Vector and tau along columns + + vr(lr+1) = tau +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Bcast(vr, int(lr+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(cur_pcol,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + + if (useGPU_reduction_lower_block_to_tridiagonal) then + vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) + else + vmrCPU(1:lr,lc) = vr(1:lr) + endif + tau = vr(lr+1) + +#if REALCASE == 1 + tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat +#endif +#if COMPLEXCASE == 1 + tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat +#endif + ! Transform remaining columns in current block with Householder Vector + ! Local dot product + + aux1 = 0.0_rck + +#ifdef WITH_OPENMP +#if 0 + ! 
original complex implementation without openmp. check performance + nlc = 0 ! number of local columns + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + enddo + + ! Get global dot products +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + + ! Transform + + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) + + endif + enddo + + + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ +! if (nlc>0) aux2=aux1 + + ! Transform + + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr) + endif + enddo + +#endif /* WITH_MPI */ +! +! ! Transform +! +! nlc = 0 +! do j=1,lc-1 +! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) +! if (lcx>0) then +! nlc = nlc+1 +! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) + +! endif +! enddo +#endif /* if 0 */ + + !Open up one omp region to avoid paying openmp overhead. + !This does not help performance due to the addition of two openmp barriers around the MPI call, + !But in the future this may be beneficial if these barriers are replaced with a faster implementation + + !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) + mynlc = 0 ! 
number of local columns + + !This loop does not have independent iterations, + !'mynlc' is incremented each iteration, and it is difficult to remove this dependency + !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration + !That is, a thread only executes the work associated with an iteration if its thread id is congruent to + !the iteration number modulo the number of threads + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0 ) then + mynlc = mynlc+1 + if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then + if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + endif + enddo + + ! Get global dot products + + !$omp barrier + !$omp single +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + if (mynlc>0) call mpi_allreduce(aux1, aux2, int(mynlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (mynlc>0) aux2 = aux1 +#endif /* WITH_MPI */ + !$omp end single + !$omp barrier + + ! Transform + transformChunkSize=32 + mynlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + mynlc = mynlc+1 + !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 + !However, for some reason this is slower than doing it manually, so it is parallelized as below. + do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize + do pp = 1,transformChunkSize + if (pp + ii > lr) exit +#if REALCASE == 1 + a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) +#endif +#if COMPLEXCASE == 1 + a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp) +#endif + enddo + enddo + endif + enddo + !$omp end parallel + +#else /* WITH_OPENMP */ + + nlc = 0 ! 
number of local columns + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 + if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx)) + endif + enddo + + ! Get global dot products +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (nlc>0) aux2=aux1 +#endif /* WITH_MPI */ + ! Transform + + nlc = 0 + do j=1,lc-1 + lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) + if (lcx>0) then + nlc = nlc+1 +#if REALCASE == 1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) +#endif +#if COMPLEXCASE == 1 + a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) +#endif + endif + enddo +#endif /* WITH_OPENMP */ + enddo ! lc + + if (useGPU_reduction_lower_block_to_tridiagonal) then + ! store column tiles back to GPU + cur_pcol = pcol(istep*nbw+1, nblk, np_cols) + if (my_pcol == cur_pcol) then + successCUDA = cuda_memcpy2d((a_dev+ & + int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), & + int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), & + int(lda*size_of_datatype,kind=c_intptr_t), & + int(lr_end*size_of_datatype,kind=c_intptr_t), & + int((lc_end - lc_start+1),kind=c_intptr_t), & + int(cudaMemcpyHostToDevice,kind=c_int)) + + if (.not.(successCUDA)) then + print *, "bandred_& + &MATH_DATATYPE& + &: cuda memcpy a_dev failed ", istat + stop 1 + endif + endif + endif + + ! Calculate scalar products of stored Householder vectors. + ! 
This can be done in different ways, we use dsyrk + + vav = 0 + call obj%timer%start("blas") + if (useGPU_reduction_lower_block_to_tridiagonal) then + if (l_rows>0) & +#if REALCASE == 1 + call PRECISION_SYRK('U', 'T', & +#endif +#if COMPLEXCASE == 1 + call PRECISION_HERK('U', 'C', & +#endif + int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, & + vmrCUDA, int(cur_l_rows,kind=BLAS_KIND), & + ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) + + else ! useGPU_reduction_to_tridiagonal + if (l_rows>0) & +#if REALCASE == 1 + call PRECISION_SYRK('U', 'T', & +#endif +#if COMPLEXCASE == 1 + call PRECISION_HERK('U', 'C', & +#endif + int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, vmrCPU, & + int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) + endif + call obj%timer%stop("blas") +#if REALCASE == 1 + call symm_matrix_allreduce_& +#endif +#if COMPLEXCASE == 1 + call herm_matrix_allreduce_& +#endif + &PRECISION & + (obj, n_cols,vav, nbw, nbw,mpi_comm_rows) + ! Calculate triangular matrix T for block Householder Transformation + call obj%timer%start("blas") + do lc=n_cols,1,-1 + tau = tmat(lc,lc,istep) + if (lc vmc (stored in umc, second half) + if (useGPU) then + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, & + umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, & + mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads) + else ! useGPU + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, & + umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & + 1, istep*nbw, n_cols, nblk, max_threads) + endif + + ! Calculate umc = A**T * vmr + ! Note that the distributed A has to be transposed + ! Opposed to direct tridiagonalization there is no need to use the cache locality + ! of the tiles, so we can use strips of the matrix + + +#if 0 + ! 
original complex implemetation check for performance + umcCPU(1:l_cols,1:n_cols) = 0.0_rck + vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck + + if (l_cols>0 .and. l_rows>0) then + do i=0,(istep*nbw-1)/tile_size + + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + if (lce0 .and. l_rows>0) +#endif /* if 0 */ + + !Code for Algorithm 4 + + ! n_way is actually a branch for the number of OpenMP threads + n_way = 1 +#ifdef WITH_OPENMP + +#if REALCASE == 1 + n_way = max_threads + + !$omp parallel private( i,lcs,lce,lrs,lre) +#endif + if (n_way > 1) then +#if REALCASE == 1 + !$omp do +#endif + do i=1,min(l_cols_tile, l_cols) + umcCPU(i,1:n_cols) = 0.0_rck + enddo + +#if REALCASE == 1 + !$omp do +#endif + do i=1,l_rows + vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck + enddo + + if (l_cols>0 .and. l_rows>0) then + + !SYMM variant 4 + !Partitioned Matrix Expression: + ! Ct = Atl Bt + Atr Bb + ! Cb = Atr' Bt + Abl Bb + ! + !Loop invariant: + ! Ct = Atl Bt + Atr Bb + ! + !Update: + ! C1 = A10'B0 + A11B1 + A21 B2 + ! + !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls + !is easily parallelized, and regardless of choise of algorithm, + !the startup cost for parallelizing the dgemms inside the loop is too great +#if REALCASE == 1 + !$omp do schedule(static,1) +#endif + do i=0,(istep*nbw-1)/tile_size + lcs = i*l_cols_tile+1 ! local column start + lce = min(l_cols, (i+1)*l_cols_tile) ! local column end + + lrs = i*l_rows_tile+1 ! local row start + lre = min(l_rows, (i+1)*l_rows_tile) ! local row end + + !C1 += [A11 A12] [B1 + ! B2] + if ( lre > lrs .and. 
l_cols > lcs ) then + call obj%timer%start("blas") + if (isSkewsymmetric) then + call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + int(l_cols-lcs+1,kind=BLAS_KIND), & + -ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & + ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) + else + call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + int(l_cols-lcs+1,kind=BLAS_KIND), & + ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), & + ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) ) + + endif + call obj%timer%stop("blas") + endif + + ! C1 += A10' B0 + if ( lce > lcs .and. i > 0 ) then + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lrs-1,kind=BLAS_KIND), & + ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND), & + vmrCPU(1,1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), & + ZERO, umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) + call obj%timer%stop("blas") + endif + enddo + endif ! l_cols>0 .and. l_rows>0 + + else ! n_way > 1 +#endif /* WITH_OPENMP */ + + if (useGPU) then + umcCUDA(1 : l_cols * n_cols) = 0.0_rck + vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0.0_rck + else ! useGPU + umcCPU(1:l_cols,1:n_cols) = 0.0_rck + vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck + endif ! useGPU + + if (l_cols>0 .and. 
l_rows>0) then + + if (useGPU) then + successCUDA = cuda_memcpy(vmr_dev, & + int(loc(vmrCUDA(1)),kind=c_intptr_t),& + vmr_size*size_of_datatype,cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy vmr_dev 3" + stop 1 + endif + + successCUDA = cuda_memcpy(umc_dev, & + int(loc(umcCUDA(1)),kind=c_intptr_t), & + umc_size*size_of_datatype,cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy umc_dev 3" + stop 1 + endif + endif ! useGPU + + do i=0,(istep*nbw-1)/tile_size + + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + if (lce0 .and. l_rows>0 + +#ifdef WITH_OPENMP + endif ! n_way > 1 +#if REALCASE == 1 + !$omp end parallel +#endif +#endif + ! Sum up all ur(:) parts along rows and add them to the uc(:) parts + ! on the processors containing the diagonal + ! This is only necessary if ur has been calculated, i.e. if the + ! global tile size is smaller than the global remaining matrix + + ! Or if we used the Algorithm 4 + if (tile_size < istep*nbw .or. n_way > 1) then + + if (useGPU) then + + call elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows, & + mpi_comm_rows, umcCUDA, & + cur_l_cols, mpi_comm_cols, istep*nbw, n_cols, nblk, max_threads) + else ! useGPU + + call elpa_reduce_add_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & + umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & + istep*nbw, n_cols, nblk, max_threads) + endif ! useGPU + endif ! tile_size < istep*nbw .or. n_way > 1 + + if (l_cols>0) then + + if (useGPU) then +#ifdef WITH_MPI + allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating tmpCUDA "//errorMessage + stop 1 + endif + + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_allreduce(umcCUDA, tmpCUDA, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), ierr) + + umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + ! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols) + +#endif /* WITH_MPI */ + + if (allocated(tmpCUDA)) then + deallocate(tmpCUDA, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when deallocating tmpCUDA "//errorMessage + stop 1 + endif + endif + + else ! useGPU + + allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when allocating tmpCPU "//errorMessage + stop 1 + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_allreduce(umcCPU, tmpCPU, int(l_cols*n_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ +! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols) +#endif /* WITH_MPI */ + + deallocate(tmpCPU, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"bandred_& + &MATH_DATATYPE& + &: error when deallocating tmpCPU "//errorMessage + stop 1 + endif + endif ! useGPU + endif ! l_cols > 0 + + ! 
U = U * Tmat**T + + if (useGPU) then + successCUDA = cuda_memcpy(umc_dev, & + int(loc(umcCUDA(1)),kind=c_intptr_t), & + umc_size*size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy umc_dev 5" + stop 1 + endif + successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), & + nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy tmat_dev 2" + stop 1 + endif + + call obj%timer%start("cublas") + call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', & + l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols) + call obj%timer%stop("cublas") + + ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T + successCUDA = cuda_memcpy(vav_dev,int(loc(vav(1,1)),kind=c_intptr_t), & + nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy vav_dev 2" + stop 1 + endif + call obj%timer%start("cublas") + + call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, & + (umc_dev+(cur_l_cols * n_cols )*size_of_datatype),cur_l_cols, & + ZERO, vav_dev, nbw) + + call cublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', & + n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw) + call obj%timer%stop("cublas") + + successCUDA = cuda_memcpy(int(loc(vav(1,1)),kind=c_intptr_t), & + vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy vav_dev3" + stop 1 + endif + else ! 
useGPU + + call obj%timer%start("blas") + + call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', & + int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), & + int(ubound(tmat,dim=1),kind=BLAS_KIND), & + umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND)) + + ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T + + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND), umcCPU(1,n_cols+1), & + int(ubound(umcCPU,dim=1),kind=BLAs_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND)) + + call PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', & + int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), & + int(ubound(tmat,dim=1),kind=BLAS_KIND), vav, int(ubound(vav,dim=1),kind=BLAS_KIND) ) + call obj%timer%stop("blas") + + endif ! useGPU + +#if REALCASE == 1 +#ifdef HAVE_SKEWSYMMETRIC + if (isSkewsymmetric) then + call ssymm_matrix_allreduce_& + &PRECISION & + (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols) + else +#endif + call symm_matrix_allreduce_& + &PRECISION & + (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols) +#ifdef HAVE_SKEWSYMMETRIC + endif +#endif +#endif /* REALCASE */ +#if COMPLEXCASE == 1 + call herm_matrix_allreduce_& + &PRECISION & + (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols) +#endif + + if (useGPU) then + successCUDA = cuda_memcpy(vav_dev, int(loc(vav(1,1)),kind=c_intptr_t), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy vav_dev4" + stop 1 + endif + endif + + ! 
U = U - 0.5 * V * VAV + + if (useGPU) then + call obj%timer%start("cublas") + if (isSkewsymmetric) then + call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,& +#if REALCASE == 1 + 0.5_rk, & +#endif +#if COMPLEXCASE == 1 + (0.5_rk, 0.0_rk), & +#endif + (umc_dev+(cur_l_cols * n_cols )* & + size_of_datatype), & + cur_l_cols, vav_dev,nbw, & + ONE, umc_dev, cur_l_cols) + else + call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,& +#if REALCASE == 1 + -0.5_rk, & +#endif +#if COMPLEXCASE == 1 + (-0.5_rk, 0.0_rk), & +#endif + (umc_dev+(cur_l_cols * n_cols )* & + size_of_datatype), & + cur_l_cols, vav_dev,nbw, & + ONE, umc_dev, cur_l_cols) + endif + call obj%timer%stop("cublas") + + successCUDA = cuda_memcpy( & + int(loc(umcCUDA(1)),kind=c_intptr_t), & + umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost) + + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy umc_dev 6" + stop 1 + endif + + ! Transpose umc -> umr (stored in vmr, second half) + if (isSkewsymmetric) then + call elpa_transpose_vectors_ss_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, umcCUDA, cur_l_cols, mpi_comm_cols, & + vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, & + 1, istep*nbw, n_cols, nblk, max_threads) + else + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, umcCUDA, cur_l_cols, mpi_comm_cols, & + vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, & + 1, istep*nbw, n_cols, nblk, max_threads) + endif + + successCUDA = cuda_memcpy(vmr_dev, & + int(loc(vmrCUDA(1)),kind=c_intptr_t), & + vmr_size*size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMemcpy vmr_dev 5 " + stop 1 + endif + + successCUDA = cuda_memcpy(umc_dev, & + int(loc(umcCUDA(1)),kind=c_intptr_t), & + umc_size*size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in 
cudaMemcpy umc_dev 7" + stop 1 + endif + else ! useGPU + call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + 0.5_rk, umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, & + int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) + else + call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + -0.5_rk, umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, & + int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND) ) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + (-0.5_rk, 0.0_rk), & + umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, & + int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND)) +#endif + + call obj%timer%stop("blas") + ! Transpose umc -> umr (stored in vmr, second half) + if (isSkewsymmetric) then + call elpa_transpose_vectors_ss_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & + vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & + 1, istep*nbw, n_cols, nblk, max_threads) + else + call elpa_transpose_vectors_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & + vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & + 1, istep*nbw, n_cols, nblk, max_threads) + endif + endif ! useGPU + + ! 
A = A - V*U**T - U*V**T + +#ifdef WITH_OPENMP + !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) + n_threads = omp_get_num_threads() + + if (mod(n_threads, 2) == 0) then + n_way = 2 + else + n_way = 1 + endif + + m_way = n_threads / n_way + + m_id = mod(omp_get_thread_num(), m_way) + n_id = omp_get_thread_num() / m_way + + do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way + i = ii / tile_size + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + lre = min(l_rows,(i+1)*l_rows_tile) + if (lce lre ) myend = lre + if ( myend-mystart+1 < 1) cycle + call obj%timer%start("blas") + call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), & + int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, & + vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), & + umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), & + ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) ) + call obj%timer%stop("blas") + enddo + !$omp end parallel +!#if COMPLEXCASE == 1 +! do i=0,(istep*nbw-1)/tile_size +! lcs = i*l_cols_tile+1 +! lce = min(l_cols,(i+1)*l_cols_tile) +! lre = min(l_rows,(i+1)*l_rows_tile) +! if (lce +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
Author: Andreas Marek, MPCDF +#endif + +#include "../general/sanity.F90" + +#define COMPLEXCASE 1 +#undef REALCASE +#include "elpa2_bandred_template.F90" +#undef COMPLEXCASE +#define COMPLEXCASE 1 +#include "elpa2_herm_matrix_allreduce_complex_template.F90" +#undef COMPLEXCASE +#define COMPLEXCASE 1 +#include "elpa2_trans_ev_band_to_full_template.F90" +#include "elpa2_tridiag_band_template.F90" +#include "elpa2_trans_ev_tridi_to_band_template.F90" + diff -Nru elpa-2016.05.001/src/elpa2/elpa2_compute.F90 elpa-2019.11.001/src/elpa2/elpa2_compute.F90 --- elpa-2016.05.001/src/elpa2/elpa2_compute.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_compute.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,143 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), fomerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! 
but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +module ELPA2_compute + +! Version 1.1.2, 2011-02-21 + + use ELPA_utilities + USE ELPA1_compute + use elpa_pdgeqrf + use precision + use elpa_mpi + use aligned_mem + + implicit none + + PRIVATE ! 
By default, all routines contained are private + + public :: bandred_real_double + public :: tridiag_band_real_double + public :: trans_ev_tridi_to_band_real_double + public :: trans_ev_band_to_full_real_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: bandred_real_single + public :: tridiag_band_real_single + public :: trans_ev_tridi_to_band_real_single + public :: trans_ev_band_to_full_real_single +#endif + + public :: bandred_complex_double + public :: tridiag_band_complex_double + public :: trans_ev_tridi_to_band_complex_double + public :: trans_ev_band_to_full_complex_double + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: bandred_complex_single + public :: tridiag_band_complex_single + public :: trans_ev_tridi_to_band_complex_single + public :: trans_ev_band_to_full_complex_single +#endif + public :: band_band_real_double +! public :: divide_band + + integer(kind=ik), public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used + ! 0 for unblocked + ! 1 for blocked (maxrank: nblk) + contains + +! real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa2_compute_real_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa2_compute_real_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_REAL */ + +! complex double precision +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa2_compute_complex_template.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +! 
complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "elpa2_compute_complex_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +end module ELPA2_compute diff -Nru elpa-2016.05.001/src/elpa2/elpa2_compute_real_template.F90 elpa-2019.11.001/src/elpa2/elpa2_compute_real_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_compute_real_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_compute_real_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,616 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), fomerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: Andreas Marek, MPCDF +#endif + +#include "../general/sanity.F90" + +#define REALCASE 1 +#undef COMPLEXCASE +#include "elpa2_bandred_template.F90" +#define REALCASE 1 +#undef SKEW_SYMMETRIC_BUILD +#include "elpa2_symm_matrix_allreduce_real_template.F90" +#ifdef HAVE_SKEWSYMMETRIC +#define SKEW_SYMMETRIC_BUILD +#include "elpa2_symm_matrix_allreduce_real_template.F90" +#undef SKEW_SYMMETRIC_BUILD +#endif +#undef REALCASE +#define REALCASE 1 +#include "elpa2_trans_ev_band_to_full_template.F90" +#include "elpa2_tridiag_band_template.F90" +#include "elpa2_trans_ev_tridi_to_band_template.F90" + + + + subroutine band_band_real_& +&PRECISION & + (obj, na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, communicator) + !------------------------------------------------------------------------------- + ! band_band_real: + ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored. + ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with + ! 
zero entries, if this + ! requirement doesn't hold) + ! + ! na Order of matrix + ! + ! nb Semi bandwidth of original matrix + ! + ! nb2 Semi bandwidth of target matrix + ! + ! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout + ! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb) + ! are located on rank n. + ! + ! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is + ! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located + ! on rank n. + ! + ! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) + ! + ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) + ! + ! communicator + ! MPI-Communicator for the total processor set + !------------------------------------------------------------------------------- + use elpa_abstract_impl + use elpa2_workload + use elpa_blas_interfaces + + use precision + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, communicator + real(kind=rk), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size + real(kind=rk), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size + real(kind=rk), intent(out) :: d(na), e(na) ! 
set only on PE 0 + + real(kind=rk) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), & + tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2) + + real(kind=rk) :: work(nb*nb2), work2(nb2*nb2) + integer(kind=ik) :: lwork, info + integer(kind=BLAS_KIND) :: infoBLAS + + integer(kind=ik) :: istep, i, n, dest + integer(kind=ik) :: n_off, na_s + integer(kind=ik) :: my_pe, n_pes + integer(kind=MPI_KIND) :: my_peMPI, n_pesMPI, mpierr + integer(kind=ik) :: nblocks_total, nblocks + integer(kind=ik) :: nblocks_total2, nblocks2 + integer(kind=MPI_KIND) :: ireq_ab, ireq_hv +#ifdef WITH_MPI +! integer(kind=ik) :: MPI_STATUS_IGNORE(MPI_STATUS_SIZE) +#endif +! integer(kind=ik), allocatable :: mpi_statuses(:,:) + integer(kind=ik), allocatable :: block_limits(:), block_limits2(:) + integer(kind=MPI_KIND), allocatable :: ireq_ab2(:) + + integer(kind=ik) :: j, nc, nr, ns, ne, iblk + integer(kind=ik) :: istat + character(200) :: errorMessage + + call obj%timer%start("band_band_real" // PRECISION_SUFFIX) + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(communicator,kind=MPI_KIND) ,my_peMPI ,mpierr) + call mpi_comm_size(int(communicator,kind=MPI_KIND) ,n_pesMPI ,mpierr) + + my_pe = int(my_peMPI,kind=c_int) + n_pes = int(n_pesMPI,kind=c_int) + call obj%timer%stop("mpi_communication") + + ! Total number of blocks in the band: + nblocks_total = (na-1)/nb + 1 + nblocks_total2 = (na-1)/nb2 + 1 + + ! Set work distribution + allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"error allocating block_limits "//errorMessage + stop 1 + endif + call divide_band(obj, nblocks_total, n_pes, block_limits) + + allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"error allocating block_limits2 "//errorMessage + stop 1 + endif + + call divide_band(obj, nblocks_total2, n_pes, block_limits2) + + ! 
nblocks: the number of blocks for my task + nblocks = block_limits(my_pe+1) - block_limits(my_pe) + nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe) + + allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"error allocating ireq_ab2 "//errorMessage + stop 1 + endif + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + ireq_ab2 = MPI_REQUEST_NULL + + if (nb2>1) then + do i=0,nblocks2-1 + + call mpi_irecv(ab2(1,i*nb2+1), int(2*nb2*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, & + 0_MPI_KIND, 3_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab2(i+1), mpierr) + enddo + endif + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + ! carefull the "recieve" has to be done at the corresponding send or wait +! if (nb2>1) then +! do i=0,nblocks2-1 +! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2) +! enddo +! endif + +#endif /* WITH_MPI */ + ! n_off: Offset of ab within band + n_off = block_limits(my_pe)*nb + lwork = nb*nb2 + dest = 0 +#ifdef WITH_MPI + ireq_ab = MPI_REQUEST_NULL + ireq_hv = MPI_REQUEST_NULL +#endif + ! --------------------------------------------------------------------------- + ! Start of calculations + + na_s = block_limits(my_pe)*nb + 1 + + if (my_pe>0 .and. na_s<=na) then + ! send first nb2 columns to previous PE + ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) + do i=1,nb2 + ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) + enddo +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call mpi_isend(ab_s, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), & + 1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + + do istep=1,na/nb2 + + if (my_pe==0) then + + n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced + hv(:,:) = 0.0_rk + tau(:) = 0.0_rk + + ! 
The last step (istep=na-1) is only needed for sending the last HH vectors. + ! We don't want the sign of the last element flipped (analogous to the other sweeps) + if (istep < na/nb2) then + + ! Transform first block column of remaining matrix + call obj%timer%start("blas") + call PRECISION_GEQRF(int(n,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(1+nb2,na_s-n_off), & + int(2*nb-1,kind=BLAs_KIND), tau, work, int(lwork,kind=BLAS_KIND), & + infoBLAS) + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + + do i=1,nb2 + hv(i,i) = 1.0_rk + hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1) + ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0.0_rk + enddo + + endif + + if (nb2==1) then + d(istep) = ab(1,na_s-n_off) + e(istep) = ab(2,na_s-n_off) + if (istep == na) then + e(na) = 0.0_rk + endif + else + ab_s2 = 0.0_rk + ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1) + if (block_limits2(dest+1)1) then + do i= 0,nblocks2-1 + ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2) + enddo + endif +#endif /* WITH_MPI */ + + endif + + else + if (na>na_s+nb2-1) then + ! Receive Householder vectors from previous task, from PE owning subdiagonal +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_recv(hv, int(nb*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), & + 2_MPI_KIND, int(communicator,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2) +#endif /* WITH_MPI */ + + do i=1,nb2 + tau(i) = hv(i,i) + hv(i,i) = 1.0_rk + enddo + endif + endif + + na_s = na_s+nb2 + if (na_s-n_off > nb) then + ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) + ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rk + n_off = n_off + nb + endif + + do iblk=1,nblocks + ns = na_s + (iblk-1)*nb - n_off ! first column in block + ne = ns+nb-nb2 ! last column in block + + if (ns+n_off>na) exit + + nc = MIN(na-ns-n_off+1,nb) ! 
number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) + ! Note that nr>=0 implies that diagonal block is full (nc==nb)! + call wy_gen_& + &PRECISION& + &(obj,nc,nb2,w,hv,tau,work,nb) + + if (iblk==nblocks .and. nc==nb) then + !request last nb2 columns +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_recv(ab_r, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe+1,kind=MPI_KIND), & + 1_MPI_KIND, int(communicator,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2) +#endif /* WITH_MPI */ + do i=1,nb2 + ab(1:nb+1,ne+i-1) = ab_r(:,i) + enddo + endif + hv_new(:,:) = 0.0_rk ! Needed, last rows must be 0 for nr < nb + tau_new(:) = 0.0_rk + + if (nr>0) then + call wy_right_& + &PRECISION& + &(obj,nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb) + call obj%timer%start("blas") + call PRECISION_GEQRF(int(nr,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(nb+1,ns), & + int(2*nb-1,kind=BLAS_KIND), tau_new, work, int(lwork,kind=BLAS_KIND), & + infoBLAS) + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + do i=1,nb2 + hv_new(i,i) = 1.0_rk + hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1) + ab(nb+2:,ns+i-1) = 0.0_rk + enddo + + !send hh-Vector + if (iblk==nblocks) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) + call obj%timer%stop("mpi_communication") + +#endif + hv_s = hv_new + do i=1,nb2 + hv_s(i,i) = tau_new(i) + enddo +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_isend(hv_s, int(nb*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe+1,kind=MPI_KIND), & + 2_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_hv, mpierr) + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +#endif /* WITH_MPI */ + endif + endif + + call wy_symm_& + &PRECISION& + 
&(obj,nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb) + + if (my_pe>0 .and. iblk==1) then + !send first nb2 columns to previous PE +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) + call obj%timer%stop("mpi_communication") + +#endif + do i=1,nb2 + ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) + enddo +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_isend(ab_s, int((nb+1)*nb2,kind=MPI_KIND), MPI_REAL_PRECISION, int(my_pe-1,kind=MPI_KIND), & + 1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr) + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +#endif /* WITH_MPI */ + endif + + if (nr>0) then + call wy_gen_& + &PRECISION& + &(obj,nr,nb2,w_new,hv_new,tau_new,work,nb) + call wy_left_& + &PRECISION& + &(obj,nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) + endif + + ! Use new HH Vector for the next block + hv(:,:) = hv_new(:,:) + tau = tau_new + enddo + enddo + + ! Finish the last outstanding requests +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) +! allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage) +! if (istat .ne. 0) then +! print *,"error allocating mpi_statuses "//errorMessage +! stop 1 +! endif + + call mpi_waitall(nblocks2,ireq_ab2,MPI_STATUSES_IGNORE,mpierr) +! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) +! if (istat .ne. 0) then +! print *,"error deallocating mpi_statuses "//errorMessage +! stop 1 +! endif + + call mpi_barrier(int(communicator,kind=MPI_KIND) ,mpierr) + call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + + deallocate(block_limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"error deallocating block_limits "//errorMessage + stop 1 + endif + + deallocate(block_limits2, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"error deallocating block_limits2 "//errorMessage + stop 1 + endif + + deallocate(ireq_ab2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"error deallocating ireq_ab2 "//errorMessage + stop 1 + endif + + call obj%timer%stop("band_band_real" // PRECISION_SUFFIX) + + end subroutine + + subroutine wy_gen_& + &PRECISION& + &(obj, n, nb, W, Y, tau, mem, lda) + + use elpa_abstract_impl + use elpa_blas_interfaces + + use precision + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: n !length of householder-vectors + integer(kind=ik), intent(in) :: nb !number of householder-vectors + integer(kind=ik), intent(in) :: lda !leading dimension of Y and W + real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b + real(kind=rk), intent(in) :: tau(nb) !tau values + real(kind=rk), intent(out) :: W(lda,nb) !output matrix W + real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb + + integer(kind=ik) :: i + + call obj%timer%start("wy_gen" // PRECISION_SUFFIX) + + W(1:n,1) = tau(1)*Y(1:n,1) + do i=2,nb + W(1:n,i) = tau(i)*Y(1:n,i) + call obj%timer%start("blas") + call PRECISION_GEMV('T', int(n,kind=BLAS_KIND), int(i-1,kind=BLAS_KIND), 1.0_rk, Y, int(lda,kind=BLAS_KIND), & + W(1,i), 1_BLAS_KIND, 0.0_rk, mem, 1_BLAS_KIND) + call PRECISION_GEMV('N', int(n,kind=BLAS_KIND), int(i-1,kind=BLAS_KIND), -1.0_rk, W, int(lda,kind=BLAS_KIND), & + mem, 1_BLAS_KIND, 1.0_rk, W(1,i), 1_BLAS_KIND) + call obj%timer%stop("blas") + enddo + call obj%timer%stop("wy_gen" // PRECISION_SUFFIX) + end subroutine + + subroutine wy_left_& + &PRECISION& + &(obj, n, m, nb, A, lda, W, Y, mem, lda2) + + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: n !width of the 
matrix A + integer(kind=ik), intent(in) :: m !length of matrix W and Y + integer(kind=ik), intent(in) :: nb !width of matrix W and Y + integer(kind=ik), intent(in) :: lda !leading dimension of A + integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y + real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size + real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W + real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y + real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb + + call obj%timer%start("wy_left" // PRECISION_SUFFIX) + call obj%timer%start("blas") + call PRECISION_GEMM('T', 'N', int(nb,kind=BLAS_KIND), int(n,kind=BLAS_KIND), int(m,kind=BLAS_KIND), & + 1.0_rk, W, int(lda2,kind=BLAS_KIND), A, int(lda,kind=BLAS_KIND), 0.0_rk, mem, & + int(nb,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(m,kind=BLAS_KIND), int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), & + -1.0_rk, Y, int(lda2,kind=BLAS_KIND), mem, int(nb,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND)) + call obj%timer%stop("blas") + call obj%timer%stop("wy_left" // PRECISION_SUFFIX) + end subroutine + + subroutine wy_right_& + &PRECISION& + &(obj, n, m, nb, A, lda, W, Y, mem, lda2) + + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: n !height of the matrix A + integer(kind=ik), intent(in) :: m !length of matrix W and Y + integer(kind=ik), intent(in) :: nb !width of matrix W and Y + integer(kind=ik), intent(in) :: lda !leading dimension of A + integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y + real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size + real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W + real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y + real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb + + + call obj%timer%start("wy_right" // PRECISION_SUFFIX) + call obj%timer%start("blas") + call PRECISION_GEMM('N', 'N', int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(m,kind=BLAS_KIND), & + 1.0_rk, A, int(lda,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem, int(n,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'T', int(n,kind=BLAS_KIND), int(m,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), & + -1.0_rk, mem, int(n,kind=BLAS_KIND), Y, int(lda2,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND)) + call obj%timer%stop("blas") + call obj%timer%stop("wy_right" // PRECISION_SUFFIX) + + end subroutine + + subroutine wy_symm_& + &PRECISION& + &(obj, n, nb, A, lda, W, Y, mem, mem2, lda2) + + use elpa_abstract_impl + use elpa_blas_interfaces + + use precision + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y + integer(kind=ik), intent(in) :: nb !width of matrix W and Y + integer(kind=ik), intent(in) :: lda !leading dimension of A + integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y + real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size + real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W + real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y + real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb + real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb + + call obj%timer%start("wy_symm" // PRECISION_SUFFIX) + call obj%timer%start("blas") + call PRECISION_SYMM('L', 'L', int(n, kind=BLAS_KIND), int(nb,kind=BLAS_KIND), 1.0_rk, A, & + int(lda,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem, int(n,kind=BLAS_KIND)) + call PRECISION_GEMM('T', 'N', int(nb,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(n,kind=BLAS_KIND), & + 1.0_rk, mem, int(n,kind=BLAS_KIND), W, int(lda2,kind=BLAS_KIND), 0.0_rk, mem2, & + int(nb,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), & + -0.5_rk, Y, int(lda2,kind=BLAS_KIND), mem2, int(nb,kind=BLAS_KIND), 1.0_rk, mem, int(n,kind=BLAS_KIND)) + call PRECISION_SYR2K('L', 'N',int(n,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -1.0_rk, Y, int(lda2,kind=BLAS_KIND), & + mem, int(n,kind=BLAS_KIND), 1.0_rk, A, int(lda,kind=BLAS_KIND)) + call obj%timer%stop("blas") + call obj%timer%stop("wy_symm" // PRECISION_SUFFIX) + + end subroutine + diff -Nru elpa-2016.05.001/src/elpa2/elpa2_determine_workload.F90 elpa-2019.11.001/src/elpa2/elpa2_determine_workload.F90 --- elpa-2016.05.001/src/elpa2/elpa2_determine_workload.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_determine_workload.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,146 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), fomerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! 
- Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + + + +! ELPA2 -- 2-stage solver for ELPA +! +! 
Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "config-f90.h" + +module elpa2_workload + + implicit none + private + + public :: determine_workload + public :: divide_band + + contains + subroutine determine_workload(obj, na, nb, nprocs, limits) + use elpa_abstract_impl + use precision + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nb, nprocs + integer(kind=ik), intent(out) :: limits(0:nprocs) + + integer(kind=ik) :: i + + call obj%timer%start("determine_workload") + + if (na <= 0) then + limits(:) = 0 + + call obj%timer%stop("determine_workload") + return + endif + + if (nb*nprocs > na) then + ! there is not enough work for all + do i = 0, nprocs + limits(i) = min(na, i*nb) + enddo + else + do i = 0, nprocs + limits(i) = (i*na)/nprocs + enddo + endif + + call obj%timer%stop("determine_workload") + end subroutine + !--------------------------------------------------------------------------------------------------- + ! divide_band: sets the work distribution in band + ! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1) + + subroutine divide_band(obj, nblocks_total, n_pes, block_limits) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band + integer(kind=ik), intent(in) :: n_pes ! number of PEs for division + integer(kind=ik), intent(out) :: block_limits(0:n_pes) + + integer(kind=ik) :: n, nblocks, nblocks_left + + call obj%timer%start("divide_band") + + block_limits(0) = 0 + if (nblocks_total < n_pes) then + ! 
Not enough work for all: The first tasks get exactly 1 block + do n=1,n_pes + block_limits(n) = min(nblocks_total,n) + enddo + else + ! Enough work for all. If there is no exact loadbalance, + ! the LAST tasks get more work since they are finishing earlier! + nblocks = nblocks_total/n_pes + nblocks_left = nblocks_total - n_pes*nblocks + do n=1,n_pes + if (n<=n_pes-nblocks_left) then + block_limits(n) = block_limits(n-1) + nblocks + else + block_limits(n) = block_limits(n-1) + nblocks + 1 + endif + enddo + endif + + call obj%timer%stop("divide_band") + + end subroutine +end module elpa2_workload diff -Nru elpa-2016.05.001/src/elpa2/elpa2.F90 elpa-2019.11.001/src/elpa2/elpa2.F90 --- elpa-2016.05.001/src/elpa2/elpa2.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,271 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), fomerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! 
ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: Andreas Marek, MPCDF + +#include "config-f90.h" +!> \brief Fortran module which provides the routines to use the 2-stage ELPA solver. Implementation only. 
Should not be used directly +module elpa2_impl + use elpa_utilities, only : error_unit +#ifdef HAVE_LIKWID + use likwid +#endif + + implicit none + + private + + public :: elpa_solve_evp_real_2stage_double_impl !< Driver routine for real double-precision 2-stage eigenvalue problem + public :: elpa_solve_evp_complex_2stage_double_impl !< Driver routine for complex double-precision 2-stage eigenvalue problem +#ifdef WANT_SINGLE_PRECISION_REAL + public :: elpa_solve_evp_real_2stage_single_impl !< Driver routine for real single-precision 2-stage eigenvalue problem +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: elpa_solve_evp_complex_2stage_single_impl !< Driver routine for complex single-precision 2-stage eigenvalue problem +#endif + + contains + +#define REALCASE 1 + +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +!------------------------------------------------------------------------------- +!> \brief elpa_solve_evp_real_2stage_double_impl: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach +!> +!> Parameters +!> +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> +!> \param matrixCols local columns of matrix a and q +!> +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param mpi_comm_all MPI communicator for the total processor set +!> +!> \param kernel specify ELPA2 kernel to use +!> +!> \param useQR (optional) use QR decomposition +!> \param useGPU (optional) decide whether to use GPUs or not +!> +!> \result success logical, false if error occured +!------------------------------------------------------------------------------- +#include "elpa2_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +!------------------------------------------------------------------------------- +!> \brief elpa_solve_evp_real_2stage_single_impl: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach +!> +!> Parameters +!> +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> +!> \param matrixCols local columns of matrix a and q +!> +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param mpi_comm_all MPI communicator for the total processor set +!> +!> \param kernel specify ELPA2 kernel to use +!> +!> \param useQR (optional) use QR decomposition +!> \param useGPU (optional) decide whether GPUs should be used or not +!> +!> \result success logical, false if error occured +!------------------------------------------------------------------------------- +#include "elpa2_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_REAL */ + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +!> \brief elpa_solve_evp_complex_2stage_double_impl: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach +!> +!> Parameters +!> +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> +!> \param matrixCols local columns of matrix a and q +!> +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param mpi_comm_all MPI communicator for the total processor set +!> +!> \param kernel specify ELPA2 kernel to use +!> \param useGPU (optional) decide whether GPUs should be used or not +!> +!> \result success logical, false if error occured +!------------------------------------------------------------------------------- +#include "elpa2_template.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" + +!> \brief elpa_solve_evp_complex_2stage_single_impl: Fortran function to solve the single-precision complex eigenvalue problem with a 2 stage approach +!> +!> Parameters +!> +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
+!> +!> \param matrixCols local columns of matrix a and q +!> +!> \param mpi_comm_rows MPI communicator for rows +!> \param mpi_comm_cols MPI communicator for columns +!> \param mpi_comm_all MPI communicator for the total processor set +!> +!> \param kernel specify ELPA2 kernel to use +!> \param useGPU (optional) decide whether GPUs should be used or not +!> +!> \result success logical, false if error occured +!------------------------------------------------------------------------------- +#include "elpa2_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION + + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +end module elpa2_impl diff -Nru elpa-2016.05.001/src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 elpa-2019.11.001/src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_herm_matrix_allreduce_complex_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,119 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "../general/sanity.F90" + + subroutine herm_matrix_allreduce_& + &PRECISION & + (obj, n, a, lda, ldb, comm) + !------------------------------------------------------------------------------- + ! herm_matrix_allreduce: Does an mpi_allreduce for a hermitian matrix A. + ! On entry, only the upper half of A needs to be set + ! 
On exit, the complete matrix is set + use elpa_abstract_impl + use precision + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n, lda, ldb, comm + complex(kind=COMPLEX_DATATYPE) :: a(lda,ldb) + + integer(kind=ik) :: i, nc + integer(kind=MPI_KIND) :: mpierr + complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n) + + call obj%timer%start("herm_matrix_allreduce" // PRECISION_SUFFIX) + + nc = 0 + do i=1,n + h1(nc+1:nc+i) = a(1:i,i) + nc = nc+i + enddo +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(h1, h2, int(nc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, & + int(comm,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + + nc = 0 + do i=1,n + a(1:i,i) = h2(nc+1:nc+i) + a(i,1:i-1) = conjg(a(1:i-1,i)) + nc = nc+i + enddo + + +#else /* WITH_MPI */ +! h2(1:nc) = h1(1:nc) + + nc = 0 + do i=1,n + a(1:i,i) = h1(nc+1:nc+i) + a(i,1:i-1) = conjg(a(1:i-1,i)) + nc = nc+i + enddo + + +#endif /* WITH_MPI */ + +! nc = 0 +! do i=1,n +! a(1:i,i) = h2(nc+1:nc+i) +! a(i,1:i-1) = conjg(a(1:i-1,i)) +! nc = nc+i +! enddo + + call obj%timer%stop("herm_matrix_allreduce" // PRECISION_SUFFIX) + + end subroutine herm_matrix_allreduce_& + &PRECISION + + diff -Nru elpa-2016.05.001/src/elpa2/elpa2_print_kernels.F90 elpa-2019.11.001/src/elpa2/elpa2_print_kernels.F90 --- elpa-2016.05.001/src/elpa2/elpa2_print_kernels.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_print_kernels.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,154 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! 
Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! This file was written by A. Marek, MPCDF + +#include "config-f90.h" + +!> \file print_available_elpa2_kernels.F90 +!> \par +!> \brief Provide information which ELPA2 kernels are available on this system +!> +!> \details +!> It is possible to configure ELPA2 such, that different compute intensive +!> "ELPA2 kernels" can be choosen at runtime. 
+!> The service binary print_available_elpa2_kernels will query the library and tell +!> whether ELPA2 has been configured in this way, and if this is the case which kernels can be +!> choosen at runtime. +!> It will furthermore detail whether ELPA has been configured with OpenMP support +!> +!> Synopsis: print_available_elpa2_kernels +!> +!> \author A. Marek (MPCDF) + +program print_available_elpa2_kernels + use elpa + use, intrinsic :: iso_c_binding + + implicit none + + integer(kind=c_int) :: i + class(elpa_t), pointer :: e + integer :: option, error + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "Unsupported ELPA API Version" + stop 1 + endif + + e => elpa_allocate(error) + + print *, "This program will give information on the ELPA2 kernels, " + print *, "which are available with this library and it will give " + print *, "information if (and how) the kernels can be choosen at " + print *, "runtime" + print * +#ifdef WITH_OPENMP + print *, " ELPA supports threads: yes" +#else + print *, " ELPA supports threads: no" +#endif + print * + + print *, "Information on ELPA2 real case: " + print *, "=============================== " +#ifdef HAVE_ENVIRONMENT_CHECKING + print *, " choice via environment variable: yes" + print *, " environment variable name : ELPA_DEFAULT_real_kernel" +#else + print *, " choice via environment variable: no" +#endif + print * + print *, " Available real kernels are: " + print * + call print_options(e, "real_kernel") + print * + print * + + print *, "Information on ELPA2 complex case: " + print *, "=============================== " +#ifdef HAVE_ENVIRONMENT_CHECKING + print *, " choice via environment variable: yes" + print *, " environment variable name : ELPA_DEFAULT_complex_kernel" +#else + print *, " choice via environment variable: no" +#endif + print * + print *, " Available complex kernels are: " + print * + call print_options(e, "complex_kernel") + print * + print * + + call elpa_deallocate(e, error) + + contains 
+ + subroutine print_options(e, KERNEL_KEY) + class(elpa_t), intent(inout) :: e + character(len=*), intent(in) :: KERNEL_KEY + integer :: i, kernel,error + + call e%set("solver",ELPA_SOLVER_2STAGE,error) + + do i = 0, elpa_option_cardinality(KERNEL_KEY) + kernel = elpa_option_enumerate(KERNEL_KEY, i) + if (elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_COMPLEX_GPU" .or. & + elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_REAL_GPU") then + if (e%can_set("use_gpu",1) == ELPA_OK) then + call e%set("use_gpu",1, error) + endif + endif + + if (e%can_set(KERNEL_KEY, kernel) == ELPA_OK) then + print *, " ", elpa_int_value_to_string(KERNEL_KEY, kernel) + endif + end do + end subroutine + +end program print_available_elpa2_kernels diff -Nru elpa-2016.05.001/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 elpa-2019.11.001/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,145 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! 
+! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "../general/sanity.F90" + +#undef ROUTINE_NAME +#ifdef SKEW_SYMMETRIC_BUILD +#define ROUTINE_NAME ssymm_matrix_allreduce +#else +#define ROUTINE_NAME symm_matrix_allreduce +#endif + + +#ifdef SKEW_SYMMETRIC_BUILD + subroutine ssymm_matrix_allreduce_& +#else + subroutine symm_matrix_allreduce_& +#endif +&PRECISION & + (obj, n, a, lda, ldb, comm) + !------------------------------------------------------------------------------- + ! symm_matrix_allreduce: Does an mpi_allreduce for a symmetric matrix A. + ! On entry, only the upper half of A needs to be set + ! 
On exit, the complete matrix is set + !------------------------------------------------------------------------------- + use elpa_abstract_impl + use precision + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n, lda, ldb, comm +#ifdef USE_ASSUMED_SIZE + real(kind=REAL_DATATYPE) :: a(lda,*) +#else + real(kind=REAL_DATATYPE) :: a(lda,ldb) +#endif + integer(kind=ik) :: i, nc + integer(kind=MPI_KIND) :: mpierr + real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n) + + call obj%timer%start("ROUTINE_NAME" // PRECISION_SUFFIX) + + nc = 0 + do i=1,n + h1(nc+1:nc+i) = a(1:i,i) + nc = nc+i + enddo + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(h1, h2, int(nc,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, & + int(comm,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + nc = 0 + do i=1,n + a(1:i,i) = h2(nc+1:nc+i) +#ifdef SKEW_SYMMETRIC_BUILD + a(i,1:i-1) = - a(1:i-1,i) +#else + a(i,1:i-1) = a(1:i-1,i) +#endif + nc = nc+i + enddo + +#else /* WITH_MPI */ +! h2=h1 + + nc = 0 + do i=1,n + a(1:i,i) = h1(nc+1:nc+i) +#ifdef SKEW_SYMMETRIC_BUILD + a(i,1:i-1) = - a(1:i-1,i) +#else + a(i,1:i-1) = a(1:i-1,i) +#endif + nc = nc+i + enddo + +#endif /* WITH_MPI */ +! nc = 0 +! do i=1,n +! a(1:i,i) = h2(nc+1:nc+i) +! a(i,1:i-1) = a(1:i-1,i) +! nc = nc+i +! enddo + + call obj%timer%stop("ROUTINE_NAME" // PRECISION_SUFFIX) + +#ifdef SKEW_SYMMETRIC_BUILD + end subroutine ssymm_matrix_allreduce_& +#else + end subroutine symm_matrix_allreduce_& +#endif + &PRECISION + + + diff -Nru elpa-2016.05.001/src/elpa2/elpa2_template.F90 elpa-2019.11.001/src/elpa2/elpa2_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1158 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! 
with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "elpa/elpa_simd_constants.h" + + function elpa_solve_evp_& + &MATH_DATATYPE& + &_& + &2stage_& + &PRECISION& + &_impl (obj, a, ev, q) result(success) + use matrix_plot + use elpa_abstract_impl + use elpa_utilities + use elpa1_compute + use elpa2_compute + use elpa_mpi + use cuda_functions + use mod_check_for_gpu + use elpa_omp +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + use simd_kernel +#endif + use iso_c_binding + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + logical :: useGPU + logical :: isSkewsymmetric +#if REALCASE == 1 + logical :: useQR + logical :: useQRActual +#endif + integer(kind=c_int) :: kernel, kernelByUser + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*) + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols) +#ifdef HAVE_SKEWSYMMETRIC + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols) +#endif +#endif + real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na) + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:) + + integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: my_peMPI, n_pesMPI, my_prowMPI, my_pcolMPI, & + np_rowsMPI, np_colsMPI, mpierr + integer(kind=c_int) :: nbw, num_blocks +#if COMPLEXCASE == 1 + integer(kind=c_int) :: l_cols_nev, l_rows, l_cols +#endif + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:) + real(kind=C_DATATYPE_KIND), allocatable :: e(:) +#if COMPLEXCASE == 1 + 
real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:) +#endif + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:) + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:) + + + integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev + + integer(kind=c_int) :: i, j + logical :: success, successCUDA + logical :: wantDebug + integer(kind=c_int) :: istat, gpu, skewsymmetric, debug, qr + character(200) :: errorMessage + logical :: do_useGPU, do_useGPU_bandred, & + do_useGPU_tridiag_band, do_useGPU_solve_tridi, & + do_useGPU_trans_ev_tridi_to_band, & + do_useGPU_trans_ev_band_to_full + integer(kind=c_int) :: numberOfGPUDevices + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, & + mpi_comm_all, check_pd, error + + logical :: do_bandred, do_tridiag, do_solve_tridi, & + do_trans_to_band, do_trans_to_full + + integer(kind=ik) :: nrThreads +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + integer(kind=c_int) :: simdSetAvailable(NUMBER_OF_INSTR) +#endif + integer(kind=ik) :: global_index + +#if REALCASE == 1 +#undef GPU_KERNEL +#undef GENERIC_KERNEL +#undef KERNEL_STRING +#define GPU_KERNEL ELPA_2STAGE_REAL_GPU +#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC +#define KERNEL_STRING "real_kernel" +#endif +#if COMPLEXCASE == 1 +#undef GPU_KERNEL +#undef GENERIC_KERNEL +#undef KERNEL_STRING +#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU +#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC +#define KERNEL_STRING "complex_kernel" +#endif + + call obj%timer%start("elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &") + + +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + omp_threads_caller = omp_get_max_threads() + + ! 
check the number of threads that ELPA should use internally + call obj%get("omp_threads",nrThreads,error) + call omp_set_num_threads(nrThreads) +#else + nrThreads = 1 +#endif + + success = .true. + + if (present(q)) then + obj%eigenvalues_only = .false. + else + obj%eigenvalues_only = .true. + endif + + na = obj%na + nev = obj%nev + lda = obj%local_nrows + ldq = obj%local_nrows + nblk = obj%nblk + matrixCols = obj%local_ncols + + call obj%get("mpi_comm_rows",mpi_comm_rows,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + call obj%get("mpi_comm_cols",mpi_comm_cols,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + call obj%get("mpi_comm_parent",mpi_comm_all,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND) ,my_peMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_all,kind=MPI_KIND) ,n_pesMPI ,mpierr) + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) + + my_pe = int(my_peMPI, kind=c_int) + n_pes = int(n_pesMPI, kind=c_int) + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + + call obj%timer%stop("mpi_communication") + + ! special case na = 1 + if (na .eq. 1) then +#if REALCASE == 1 + ev(1) = a(1,1) +#endif +#if COMPLEXCASE == 1 + ev(1) = real(a(1,1)) +#endif + if (.not.(obj%eigenvalues_only)) then + q(1,1) = ONE + endif + + ! restore original OpenMP settings +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! 
restore this at the end of ELPA 2 + call omp_set_num_threads(omp_threads_caller) +#endif + + call obj%timer%stop("elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &") + return + endif + + if (nev == 0) then + nev = 1 + obj%eigenvalues_only = .true. + endif + + call obj%get(KERNEL_STRING,kernel,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + call obj%get("is_skewsymmetric",skewsymmetric,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + isSkewsymmetric = (skewsymmetric == 1) + + ! GPU settings + call obj%get("gpu", gpu,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + useGPU = (gpu == 1) + + do_useGPU = .false. + if (useGPU) then + call obj%timer%start("check_for_gpu") + if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then + + do_useGPU = .true. + a_dev = 0 + + ! set the neccessary parameters + cudaMemcpyHostToDevice = cuda_memcpyHostToDevice() + cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost() + cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice() + cudaHostRegisterPortable = cuda_hostRegisterPortable() + cudaHostRegisterMapped = cuda_hostRegisterMapped() + else + print *,"GPUs are requested but not detected! Aborting..." + success = .false. + return + endif + call obj%timer%stop("check_for_gpu") + endif + + do_useGPU_bandred = do_useGPU + ! tridiag-band not ported to GPU yet + do_useGPU_tridiag_band = .false. + do_useGPU_solve_tridi = do_useGPU + ! trans tridi to band GPU implementation does not work properly + do_useGPU_trans_ev_tridi_to_band = .false. + do_useGPU_trans_ev_band_to_full = do_useGPU + + ! only if we want (and can) use GPU in general, look what are the + ! requirements for individual routines. Implicitly they are all set to 1, so + ! unles specified otherwise by the user, GPU versions of all individual + ! 
routines should be used + if(do_useGPU) then + call obj%get("gpu_bandred", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_bandred = (gpu == 1) + +! call obj%get("gpu_tridiag_band", gpu, error) +! if (error .ne. ELPA_OK) then +! print *,"Problem getting option. Aborting..." +! stop +! endif +! do_useGPU_tridiag_band = (gpu == 1) + ! tridiag-band not ported to GPU yet + do_useGPU_tridiag_band = .false. + + call obj%get("gpu_solve_tridi", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_solve_tridi = (gpu == 1) + +! call obj%get("gpu_trans_ev_tridi_to_band", gpu, error) +! if (error .ne. ELPA_OK) then +! print *,"Problem getting option. Aborting..." +! stop +! endif +! do_useGPU_trans_ev_tridi_to_band = (gpu == 1) + do_useGPU_trans_ev_tridi_to_band = .false. + + call obj%get("gpu_trans_ev_band_to_full", gpu, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + do_useGPU_trans_ev_band_to_full = (gpu == 1) + endif + + ! check consistency between request for GPUs and defined kernel + if (do_useGPU_trans_ev_tridi_to_band) then + !!! this currently cannot happen, GPU_trans_ev_tridi_to_band is always false + write(error_unit,*) "ELPA: internal error!" + stop +! if (kernel .ne. GPU_KERNEL) then +! write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!" +! write(error_unit,*) "The compute kernel will be executed on CPUs!" +! do_useGPU_trans_ev_tridi_to_band = .false. +! else if (nblk .ne. 128) then +! write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!" +! write(error_unit,*) "The compute kernel will be executed on CPUs!" +! do_useGPU_trans_ev_tridi_to_band = .false. +! kernel = GENERIC_KERNEL +! endif + else + if (kernel .eq. GPU_KERNEL) then + ! 
We have currently forbidden to use GPU version of trans ev tridi to band, but we did not forbid the possibility + ! to select the GPU kernel. If done such, give warning and swicht to the generic kernel + ! TODO it would be better to forbid the possibility to set the GPU kernel completely + write(error_unit,*) "ELPA: ERROR, GPU kernel currently not implemented.& + & Use optimized CPU kernel even for GPU runs! & + Switching to the non-optimized generic kernel" + kernel = GENERIC_KERNEL + endif + endif + + ! check again, now kernel and do_useGPU_trans_ev_tridi_to_band sould be + ! finally consistent + if (do_useGPU_trans_ev_tridi_to_band) then + !!! this currently cannot happen, GPU_trans_ev_tridi_to_band is always false + write(error_unit,*) "ELPA: internal error!" + stop +! if (kernel .ne. GPU_KERNEL) then +! ! this should never happen, checking as an assert +! write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..." +! stop +! endif +! if (nblk .ne. 128) then +! ! this should never happen, checking as an assert +! write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..." +! stop +! endif + else + if (kernel .eq. GPU_KERNEL) then + ! combination not allowed + write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel & + &is defined as the GPU kernel! Aborting..." + stop + !TODO do error handling properly + endif + endif + + +#if REALCASE == 1 +#ifdef SINGLE_PRECISION_REAL + ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now + if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6 ) then + write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8" + write(error_unit,*) "The GENERIC kernel will be used at the moment" + kernel = ELPA_2STAGE_REAL_GENERIC + endif + ! 
special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now + if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6 ) then + write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64" + write(error_unit,*) "The GENERIC kernel will be used at the moment" + kernel = ELPA_2STAGE_REAL_GENERIC + endif +#endif + +#endif + + ! consistency check: is user set kernel still identical with "kernel" or did + ! we change it above? This is a mess and should be cleaned up + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + if (kernelByUser .ne. kernel) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + endif + +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + ! find a kernel which is supported on all used CPUs + ! at the moment this works only on Intel CPUs + simdSetAvailable(:) = 0 + call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR) +#ifdef WITH_MPI + call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, int(mpi_comm_all,kind=MPI_KIND), mpierr) +#endif + + ! compare user chosen kernel with possible kernels + call obj%get(KERNEL_STRING,kernelByUser,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + ! map kernel to SIMD Set, and check whether this is set is available on all cores + +#if REALCASE == 1 + if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then +#endif +#if COMPLEXCASE == 1 + if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then +#endif + + ! if we are not purely running on Intel CPUs, this feature does not work at the moment + ! 
this restriction should be lifted step by step + if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then + if (my_pe == 0 ) then + write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support." + write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!" + write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now" + stop + endif + else + if (my_pe == 0 ) then + write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs" + write(error_unit,*) "ELPA will use another kernel..." + endif + + ! find best kernel available for supported instruction sets + do i = NUMBER_OF_INSTR, 2, -1 + if (simdSetAvailable(i) == 1) then + ! map to "best" kernel with this instruction set + ! this can be only done for kernels that ELPA has been configured to use +#if REALCASE == 1 + kernel = map_simd_instruction_to_real_kernel(i) +#endif +#if COMPLEXCASE == 1 + kernel = map_simd_instruction_to_complex_kernel(i) +#endif + if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then + call obj%set(KERNEL_STRING, kernel, error) + if (error .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." + stop + endif + if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel) + exit + endif + endif + enddo + endif + + endif +#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */ + +#if REALCASE == 1 + call obj%get("qr",qr,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + if (qr .eq. 1) then + useQR = .true. + else + useQR = .false. + endif + +#endif + + call obj%get("debug",debug,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + wantDebug = debug == 1 + + + +#if REALCASE == 1 + useQRActual = .false. + ! set usage of qr decomposition via API call + if (useQR) useQRActual = .true. + if (.not.(useQR)) useQRACtual = .false. 
+ + if (useQRActual) then + if (mod(na,2) .ne. 0) then + if (wantDebug) then + write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize" + endif + print *, "Do not use QR-decomposition for this matrix and blocksize." + success = .false. + return + endif + endif +#endif /* REALCASE */ + + + + if (.not. obj%eigenvalues_only) then + q_actual => q(1:obj%local_nrows,1:obj%local_ncols) + else + allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols)) + q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols) + endif + + + ! set the default values for each of the 5 compute steps + do_bandred = .true. + do_tridiag = .true. + do_solve_tridi = .true. + do_trans_to_band = .true. + do_trans_to_full = .true. + + if (obj%eigenvalues_only) then + do_trans_to_band = .false. + do_trans_to_full = .false. + endif + + if (obj%is_set("bandwidth") == 1) then + ! bandwidth is set. That means, that the inputed matrix is actually banded and thus the + ! first step of ELPA2 should be skipped + call obj%get("bandwidth",nbw,error) + if (nbw == 0) then + if (wantDebug) then + write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", & + "for a diagonal matrix! This is too simple" + endif + print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", & + "for a diagonal matrix! This is too simple" + success = .false. + return + endif + if (mod(nbw, nblk) .ne. 0) then + ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth + ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA + nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double)) + + ! just check that effective bandwidth is NOT larger than matrix size + if (nbw .gt. na) then + if (wantDebug) then + write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", & + "which is larger than the matrix size ",na," ! ELPA will abort! 
Try to", & + "solve your problem by not specifing a bandwidth" + endif + print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", & + "which is larger than the matrix size ",na," ! ELPA will abort! Try to", & + "solve your problem by not specifing a bandwidth" + success = .false. + return + endif + endif + do_bandred = .false. ! we already have a banded matrix + do_solve_tridi = .true. ! we also have to solve something :-) + do_trans_to_band = .true. ! and still we have to backsub to banded + do_trans_to_full = .false. ! but not to full since we have a banded matrix + else ! matrix is not banded, determine the intermediate bandwidth for full->banded->tridi + !first check if the intermediate bandwidth was set by the user + call obj%get("intermediate_bandwidth", nbw, error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + + if(nbw == 0) then + ! intermediate bandwidth was not specified, select one of the defaults + + ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 + ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal. + ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32! + ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye + ! on this and maybe allow a run-time optimization here + if (do_useGPU) then + nbw = nblk + else +#if REALCASE == 1 + nbw = (63/nblk+1)*nblk +#elif COMPLEXCASE == 1 + nbw = (31/nblk+1)*nblk +#endif + endif + + else + ! intermediate bandwidth has been specified by the user, check, whether correctly + if (mod(nbw, nblk) .ne. 0) then + print *, "Specified bandwidth ",nbw," has to be mutiple of the blocksize ", nblk, ". Aborting..." + success = .false. + return + endif + endif !nbw == 0 + + num_blocks = (na-1)/nbw + 1 + + ! tmat is needed only in full->band and band->full steps, so alocate here + ! 
(not allocated for banded matrix on input) + allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &" // ": error when allocating tmat "//errorMessage + stop 1 + endif + + ! if either of full->band or band->full steps are to be done on GPU, + ! allocate also corresponding array on GPU. + if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then + successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype) + if (.not.(successCUDA)) then + print *,"bandred_& + &MATH_DATATYPE& + &: error in cudaMalloc tmat_dev 1" + stop 1 + endif + endif + + do_bandred = .true. + do_solve_tridi = .true. + do_trans_to_band = .true. + do_trans_to_full = .true. + endif ! matrix not already banded on input + + ! start the computations in 5 steps +! print * +! print *, 'do_useGPU_bandred', do_useGPU_bandred +! print *, 'do_useGPU_tridiag_band', do_useGPU_tridiag_band +! print *, 'do_useGPU_solve_tridi', do_useGPU_solve_tridi +! print *, 'do_useGPU_trans_ev_tridi_to_band', do_useGPU_trans_ev_tridi_to_band +! print *, 'do_useGPU_trans_ev_band_to_full', do_useGPU_trans_ev_band_to_full +! print * + if (do_bandred) then +! print *, 'do_useGPU_bandred=', do_useGPU_bandred + call obj%timer%start("bandred") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("bandred") +#endif + ! Reduction full -> band + call bandred_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, a, & + a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, & + tmat_dev, wantDebug, do_useGPU_bandred, success, & +#if REALCASE == 1 + useQRActual, & +#endif + nrThreads) +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("bandred") +#endif + call obj%timer%stop("bandred") + if (.not.(success)) return + endif + + + ! Reduction band -> tridiagonal + if (do_tridiag) then + allocate(e(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION " // ": error when allocating e "//errorMessage + stop 1 + endif + + call obj%timer%start("tridiag") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("tridiag") +#endif + call tridiag_band_& + &MATH_DATATYPE& + &_& + &PRECISION& + (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & + do_useGPU_tridiag_band, wantDebug, nrThreads) + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_bcast(ev, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, 0_MPI_KIND, int(mpi_comm_all,kind=MPI_KIND), mpierr) + call mpi_bcast(e, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, 0_MPI_KIND, int(mpi_comm_all,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("tridiag") +#endif + call obj%timer%stop("tridiag") + endif ! do_tridiag + +#if COMPLEXCASE == 1 + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q + l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + + allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage: error when allocating q_real"//errorMessage + stop 1 + endif +#endif + + ! Solve tridiagonal system + if (do_solve_tridi) then +! 
print *, 'do_useGPU_solve_tridi=', do_useGPU_solve_tridi + call obj%timer%start("solve") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("solve") +#endif + call solve_tridi_& + &PRECISION & + (obj, na, nev, ev, e, & +#if REALCASE == 1 + q_actual, ldq, & +#endif +#if COMPLEXCASE == 1 + q_real, ubound(q_real,dim=1), & +#endif + nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads) +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("solve") +#endif + call obj%timer%stop("solve") + if (.not.(success)) return + endif ! do_solve_tridi + + deallocate(e, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage: error when deallocating e "//errorMessage + stop 1 + endif + + if (obj%eigenvalues_only) then + do_trans_to_band = .false. + do_trans_to_full = .false. + else + + call obj%get("check_pd",check_pd,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + if (check_pd .eq. 1) then + check_pd = 0 + do i = 1, na + if (ev(i) .gt. THRESHOLD) then + check_pd = check_pd + 1 + endif + enddo + if (check_pd .lt. na) then + ! not positiv definite => eigenvectors needed + do_trans_to_band = .true. + do_trans_to_full = .true. + else + do_trans_to_band = .false. + do_trans_to_full = .false. + endif + endif + endif ! eigenvalues only + + if (do_trans_to_band) then +#if COMPLEXCASE == 1 + ! q must be given thats why from here on we can use q and not q_actual + + q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) + + deallocate(q_real, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage: error when deallocating q_real"//errorMessage + stop 1 + endif +#endif + endif + + if (isSkewsymmetric) then + ! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D. + ! This makes the eigenvectors complex. + ! 
For now real part of eigenvectors is generated in first half of q, imaginary part in second part. + q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0 + do i = 1, obj%local_nrows +! global_index = indxl2g(i, nblk, my_prow, 0, np_rows) + global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1 + if (mod(global_index-1,4) .eq. 0) then + ! do nothing + end if + if (mod(global_index-1,4) .eq. 1) then + q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols) + q(i,1:obj%local_ncols) = 0 + end if + if (mod(global_index-1,4) .eq. 2) then + q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols) + end if + if (mod(global_index-1,4) .eq. 3) then + q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols) + q(i,1:obj%local_ncols) = 0 + end if + end do + endif +! print * , "q=" +! do i=1,na +! write(*,"(100g15.5)") ( q(i,j), j=1,na ) +! enddo + ! Backtransform stage 1 + if (do_trans_to_band) then + call obj%timer%start("trans_ev_to_band") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("trans_ev_to_band") +#endif + + ! In the skew-symmetric case this transforms the real part + call trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nev, nblk, nbw, q, & + q_dev, & + ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, & + nrThreads, success=success, kernel=kernel) +! if (isSkewsymmetric) then +! ! Transform imaginary part +! ! Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix. +! call trans_ev_tridi_to_band_& +! &MATH_DATATYPE& +! &_& +! &PRECISION & +! (obj, na, nev, nblk, nbw, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), & +! q_dev, & +! ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, & +! nrThreads, success=success, kernel=kernel) +! endif +! print * , "After trans_ev_tridi_to_band: real part of q=" +! 
do i=1,na +! write(*,"(100g15.5)") ( q(i,j), j=1,na ) +! enddo +! #ifdef DOUBLE_PRECISION_REAL +! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',0) +! #endif +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("trans_ev_to_band") +#endif + call obj%timer%stop("trans_ev_to_band") + + if (.not.(success)) return + +! ! We can now deallocate the stored householder vectors +! deallocate(hh_trans, stat=istat, errmsg=errorMessage) +! if (istat .ne. 0) then +! print *, "solve_evp_& +! &MATH_DATATYPE& +! &_2stage_& +! &PRECISION " // ": error when deallocating hh_trans "//errorMessage +! stop 1 +! endif + endif ! do_trans_to_band +! print *, 'after do_useGPU_trans_ev_tridi_to_band', do_useGPU_trans_ev_tridi_to_band +! print*, 'do_useGPU_trans_ev_band_to_full=', do_useGPU_trans_ev_band_to_full + ! the array q might reside on device or host, depending on whether GPU is + ! used or not. We thus have to transfer he data manually, if one of the + ! routines is run on GPU and the other not. + + ! first deal with the situation that first backward step was on GPU + if(do_useGPU_trans_ev_tridi_to_band) then + ! if the second backward step is to be performed, but not on GPU, we have + ! to transfer q to the host + if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then + successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"elpa2_template, error in copy to host" + stop 1 + endif + endif + + ! if the last step is not required at all, or will be performed on CPU, + ! release the memmory allocated on the device + if((.not. do_trans_to_full) .or. (.not. 
do_useGPU_trans_ev_band_to_full)) then + successCUDA = cuda_free(q_dev) + print *, 'q_dev is freed' + endif + endif + + !TODO check that the memory is properly deallocated on the host in case that + !the last step is not required + + if (do_trans_to_full) then + call obj%timer%start("trans_ev_to_full") +#ifdef HAVE_LIKWID + call likwid_markerStartRegion("trans_ev_to_full") +#endif + if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then + ! copy to device if we want to continue on GPU + + successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype) +! if (.not.(successCUDA)) then +! print *,"elpa2_template, error in cuda_malloc" +! stop 1 +! endif +! print *, 'q_dev=', q_dev, 'loc(q)=', loc(q)& +! , 'ldq*matrixCols* size_of_datatype=', ldq*matrixCols* size_of_datatype, ', q(1,1)=', q(1,1) + + successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"elpa2_template, error in copy to device", successCUDA + stop 1 + endif + endif + + ! Backtransform stage 2 + ! In the skew-symemtric case this transforms the real part + + call trans_ev_band_to_full_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nev, nblk, nbw, a, & + a_dev, lda, tmat, tmat_dev, q, & + q_dev, & + ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full & +#if REALCASE == 1 + , useQRActual & +#endif + ) +! print * , "After trans_ev_band_to_full: real part of q=" +! do i=1,na +! write(*,"(100g15.5)") ( q(i,j), j=1,na ) +! enddo + call obj%timer%stop("trans_ev_to_full") + endif ! do_trans_to_full +! #ifdef DOUBLE_PRECISION_REAL +! call prmat(na,useGPU,q(1:obj%local_nrows, 1:obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1) +! #endif +! New position: + if (do_trans_to_band) then + if (isSkewsymmetric) then + ! Transform imaginary part + ! 
Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix. + call trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nev, nblk, nbw, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), & + q_dev, & + ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, & + nrThreads, success=success, kernel=kernel) + endif +! print * , "After trans_ev_tridi_to_band: imaginary part of q=" +! do i=1,na +! write(*,"(100g15.5)") ( q(i,j+na), j=1,na ) +! enddo +! #ifdef DOUBLE_PRECISION_REAL +! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1) +! #endif + ! We can now deallocate the stored householder vectors + deallocate(hh_trans, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *, "solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION " // ": error when deallocating hh_trans "//errorMessage + stop 1 + endif + endif + if (isSkewsymmetric) then + ! first deal with the situation that first backward step was on GPU + if(do_useGPU_trans_ev_tridi_to_band) then + ! if the second backward step is to be performed, but not on GPU, we have + ! to transfer q to the host + if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then + successCUDA = cuda_memcpy(loc(q(1,obj%local_ncols+1)), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"elpa2_template, error in copy to host" + stop 1 + endif + endif + + ! if the last step is not required at all, or will be performed on CPU, + ! release the memmory allocated on the device + if((.not. do_trans_to_full) .or. (.not. 
do_useGPU_trans_ev_band_to_full)) then + successCUDA = cuda_free(q_dev) + endif + endif + endif + + if (do_trans_to_full) then + call obj%timer%start("trans_ev_to_full") + if (isSkewsymmetric) then + if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then + ! copy to device if we want to continue on GPU + successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype) +! if (.not.(successCUDA)) then +! print *,"elpa2_template, error in cuda_malloc" +! stop 1 +! endif + successCUDA = cuda_memcpy(q_dev, loc(q(1,obj%local_ncols+1)), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"elpa2_template, error in copy to device" + stop 1 + endif + endif +! #ifdef DOUBLE_PRECISION_REAL +! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',0) +! #endif + ! Transform imaginary part + ! Transformation of real and imaginary part could also be one call of trans_ev_band_to_full_ acting on the n x 2n matrix. + + call trans_ev_band_to_full_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nev, nblk, nbw, a, & + a_dev, lda, tmat, tmat_dev, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), & + q_dev, & + ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full & +#if REALCASE == 1 + , useQRActual & +#endif + ) +! print * , "After trans_ev_band_to_full: imaginary part of q=" +! do i=1,na +! write(*,"(100g15.5)") ( q(i,j+na), j=1,na ) +! enddo +! #ifdef DOUBLE_PRECISION_REAL +! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',1) +! #endif + endif + + deallocate(tmat, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION " // ": error when deallocating tmat"//errorMessage + stop 1 + endif +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("trans_ev_to_full") +#endif + call obj%timer%stop("trans_ev_to_full") + endif ! do_trans_to_full + + if(do_bandred .or. do_trans_to_full) then + if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then + successCUDA = cuda_free(tmat_dev) + if (.not.(successCUDA)) then + print *,"elpa2_template: error in cudaFree, tmat_dev" + stop 1 + endif + endif + endif + + if(do_useGPU .and. (a_dev .ne. 0)) then + successCUDA = cuda_free(a_dev) + if (.not.(successCUDA)) then + print *,"elpa2_template: error in cudaFree, a_dev" + stop 1 + endif + endif + + + if (obj%eigenvalues_only) then + deallocate(q_dummy, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &" // ": error when deallocating q_dummy "//errorMessage + stop 1 + endif + endif + + ! restore original OpenMP settings +#ifdef WITH_OPENMP + ! store the number of OpenMP threads used in the calling function + ! restore this at the end of ELPA 2 + call omp_set_num_threads(omp_threads_caller) +#endif + + call obj%timer%stop("elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &") +1 format(a,f10.3) + + end function elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl + +! vim: syntax=fortran diff -Nru elpa-2016.05.001/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 elpa-2019.11.001/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,880 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! 
distributed along with the original code in the file "COPYING". +#endif + +#include "../general/sanity.F90" + + subroutine trans_ev_band_to_full_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nqc, nblk, nbw, a_mat, a_dev, lda, tmat, tmat_dev, q_mat, & + q_dev, ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU & +#if REALCASE == 1 + ,useQr) +#endif +#if COMPLEXCASE == 1 + ) +#endif + + !------------------------------------------------------------------------------- + ! trans_ev_band_to_full_real/complex: + ! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix + ! + ! Parameters + ! + ! na Order of matrix a_mat, number of rows of matrix q_mat + ! + ! nqc Number of columns of matrix q_mat + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! nbw semi bandwith + ! + ! a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a_mat after bandred_real/complex) + ! Distribution is like in Scalapack. + ! + ! lda Leading dimension of a_mat + ! matrixCols local columns of matrix a_mat and q_mat + ! + ! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex + ! + ! q_mat On input: Eigenvectors of band matrix + ! On output: Transformed eigenvectors + ! Distribution is like in Scalapack. + ! + ! ldq Leading dimension of q_mat + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! 
+ !------------------------------------------------------------------------------- + use precision + use cuda_functions + use iso_c_binding + use elpa_abstract_impl + use elpa_blas_interfaces + + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: useGPU +#if REALCASE == 1 + logical, intent(in) :: useQR +#endif + integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a_mat(lda,*) + MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*) +#else + MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols) + MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks) +#endif + integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI, mpierr + integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, & + max_local_cols + integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols + integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns + + MATH_DATATYPE(kind=rck), allocatable :: hvb(:) + MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvm(:,:) + ! hvm_dev is fist used and set in this routine + ! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted + ! tmp_dev is first used in this routine + ! 
tmat_dev is passed along from bandred_real + integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev + + integer(kind=ik) :: i + +#ifdef BAND_TO_FULL_BLOCKING + MATH_DATATYPE(kind=rck), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:) + integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows +#endif + + integer(kind=ik) :: istat + character(200) :: errorMessage + character(20) :: gpuString + logical :: successCUDA + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + integer :: blocking_factor, error + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("trans_ev_band_to_full_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + +#ifdef BAND_TO_FULL_BLOCKING + call obj%get("blocking_in_band_to_full",blocking_factor,error) + if (error .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif +#endif + call obj%timer%start("mpi_communication") + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + call obj%timer%stop("mpi_communication") + + max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of a_mat + max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat! + + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk + + if (useGPU) then + +#if REALCASE == 1 + ! here the GPU and CPU version diverged: the CPU version now always uses the useQR path which + ! is not implemented in the GPU version +#endif + + ! the GPU version does not (yet) support blocking + ! 
but the handling is the same for real/complex case + + allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmp1 "//errorMessage + stop 1 + endif + + allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmp2 "//errorMessage + stop 1 + endif + + allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvb "//errorMessage + stop 1 + endif + + allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvm "//errorMessage + stop 1 + endif + + successCUDA = cuda_malloc(hvm_dev, (max_local_rows)*nbw* size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + successCUDA = cuda_malloc(tmp_dev, (max_local_cols)*nbw* size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + +!#ifdef WITH_MPI +!! it should be possible to keep tmat dev on the device and not copy it around +!! already existent on GPU +! successCUDA = cuda_malloc(tmat_dev, nbw*nbw* & +!#if REALCASE == 1 +! size_of_PRECISION_real) +!#endif +!#if COMPLEXCASE == 1 +! size_of_PRECISION_complex) +!#endif +! +! if (.not.(successCUDA)) then +! print *,"trans_ev_band_to_full_& +! &MATH_DATATYPE& +! &: error in cudaMalloc" +! stop 1 +! endif +!#endif + +#if REALCASE == 1 +! q_dev already living on device +! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype) +! if (.not.(successCUDA)) then +! print *,"trans_ev_band_to_full_real: error in cudaMalloc" +! stop 1 +! endif + ! 
q_temp(:,:) = 0.0 + ! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols) + +! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band +! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice) +! if (.not.(successCUDA)) then +! print *,"trans_ev_band_to_full_real: error in cudaMalloc" +! stop 1 +! endif +#endif +#if COMPLEXCASE == 1 +! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex) +! if (.not.(successCUDA)) then +! print *,"trans_ev_band_to_full_complex: error in cudaMalloc" +! stop 1 +! endif +! +! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice) +! if (.not.(successCUDA)) then +! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" +! stop 1 +! endif +#endif + + ! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided + successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)* size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + hvm = 0.0_rck ! Must be set to 0 !!! + hvb = 0.0_rck ! Safety only + l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat + + do istep=1,(na-1)/nbw + + n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step + + ! Broadcast all Householder vectors for current step compressed in hvb + + nb = 0 + ns = 0 + + do lc = 1, n_cols + ncol = istep*nbw + lc ! absolute column number of householder Vector + nrow = ncol - nbw ! absolute number of pivot row + + l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast + l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number + + if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh) + + nb = nb+l_rows + + if (lc==n_cols .or. 
mod(ncol,nblk)==0) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Bcast(hvb(ns+1), int(nb-ns,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION,& + int(pcol(ncol, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + ns = nb + endif + enddo + + ! Expand compressed Householder vectors into matrix hvm + + nb = 0 + do lc = 1, n_cols + nrow = (istep-1)*nbw+lc ! absolute number of pivot row + l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast + + hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) + if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck + nb = nb+l_rows + enddo + + successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), & + max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice) + + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm" + stop 1 + + endif + + l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) + + ! Q = Q - V * T**T * V**T * Q + + if (l_rows>0) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + n_cols, l_cols, l_rows, ONE, hvm_dev, max_local_rows, & + q_dev, ldq , ZERO, tmp_dev, n_cols) + call obj%timer%stop("cublas") + +#ifdef WITH_MPI + + ! copy data from device to host for a later MPI_ALLREDUCE + ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce) + successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), & + tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host" + stop 1 + endif + + +#else /* WITH_MPI */ + ! in real case no copy needed. Don't do it in complex case neither +#endif /* WITH_MPI */ + + else ! l_rows>0 + tmp1(1:l_cols*n_cols) = 0.0_rck + endif ! 
l_rows>0 + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp1, tmp2, int(n_cols*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ +! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols) +#endif /* WITH_MPI */ + + if (l_rows>0) then +#ifdef WITH_MPI + ! after the mpi_allreduce we have to copy back to the device + ! copy back to device + successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2),kind=c_intptr_t), & + n_cols*l_cols* size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaMemcpy, tmp2" + stop 1 + endif +#else /* WITH_MPI */ + ! in real case no memcopy needed. Don't do it in complex case neither +#endif /* WITH_MPI */ + +!#ifdef WITH_MPI + ! IMPORTANT: even though tmat_dev is transfered from the previous rutine, we have to copy from tmat again + ! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we + ! need to upload another slice + successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1,istep)),kind=c_intptr_t), & + nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice) + + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaMemcpy, tmat" + stop 1 + endif +!#endif /* WITH_MPI */ + + call obj%timer%start("cublas") + call cublas_PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', & + n_cols, l_cols, ONE, tmat_dev, nbw, tmp_dev, n_cols) + + call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm_dev, max_local_rows, & + tmp_dev, n_cols, one, q_dev, ldq) + call obj%timer%stop("cublas") + + ! copy to host maybe this can be avoided + ! 
this is not necessary hvm is not used anymore + successCUDA = cuda_memcpy(int(loc(hvm),kind=c_intptr_t), & + hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host" + stop 1 + endif + endif ! l_rows > 0 + + enddo ! istep + + + + else ! do not useGPU + +#ifdef BAND_TO_FULL_BLOCKING + ! t_blocking was formerly 2; 3 is a better choice + t_blocking = blocking_factor ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once + + ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked. +! if ( na >= ((t_blocking+1)*nbw) ) then + cwy_blocking = t_blocking * nbw + + allocate(tmp1(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmp1 "//errorMessage + stop 1 + endif + + allocate(tmp2(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmp2 "//errorMessage + stop 1 + endif + + allocate(hvb(max_local_rows*cwy_blocking), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvb "//errorMessage + stop 1 + endif + + allocate(hvm(max_local_rows,cwy_blocking), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvm "//errorMessage + stop 1 + endif + +#else /* BAND_TO_FULL_BLOCKING */ + + allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmp1 "//errorMessage + stop 1 + endif + + allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE&: error when allocating tmp2 "//errorMessage + stop 1 + endif + + allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvb "//errorMessage + stop 1 + endif + + allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating hvm "//errorMessage + stop 1 + endif +#endif /* BAND_TO_FULL_BLOCKING */ + +#ifdef BAND_TO_FULL_BLOCKING + allocate(tmat_complete(cwy_blocking,cwy_blocking), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating tmat_complete "//errorMessage + stop 1 + endif + allocate(t_tmp(cwy_blocking,nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating t_tmp "//errorMessage + stop 1 + endif + allocate(t_tmp2(cwy_blocking,nbw), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when allocating t_tmp2 "//errorMessage + stop 1 + endif +#endif +! else +! allocate(tmp1(max_local_cols*nbw)) +! allocate(tmp2(max_local_cols*nbw)) +! allocate(hvb(max_local_rows*nbw)) +! allocate(hvm(max_local_rows,nbw)) +! endif + + hvm = 0.0_rck ! Must be set to 0 !!! + hvb = 0.0_rck ! Safety only + l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat + +! 
if ( na >= ((t_blocking+1)*nbw) ) then + +#ifdef BAND_TO_FULL_BLOCKING + do istep=1,((na-1)/nbw-1)/t_blocking + 1 +#else + do istep=1,(na-1)/nbw +#endif + +#ifdef BAND_TO_FULL_BLOCKING + ! This the call when using na >= ((t_blocking+1)*nbw) + ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw + ! Number of columns in current step + ! As an alternative we add some special case handling if na < cwy_blocking + IF (na < cwy_blocking) THEN + n_cols = MAX(0, na-nbw) + IF ( n_cols .eq. 0 ) THEN + EXIT + END IF + ELSE + n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step + END IF +#else /* BAND_TO_FULL_BLOCKING */ + n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step +#endif /* BAND_TO_FULL_BLOCKING */ + ! Broadcast all Householder vectors for current step compressed in hvb + + nb = 0 + ns = 0 + + do lc = 1, n_cols +#ifdef BAND_TO_FULL_BLOCKING + ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector +#else + ncol = istep*nbw + lc ! absolute column number of householder Vector +#endif + nrow = ncol - nbw ! absolute number of pivot row + + l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast + l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number + + if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh) + + nb = nb+l_rows + + if (lc==n_cols .or. mod(ncol,nblk)==0) then +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Bcast(hvb(ns+1), int(nb-ns,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + int(pcol(ncol, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + ns = nb + endif + enddo ! lc + + ! Expand compressed Householder vectors into matrix hvm + + nb = 0 + do lc = 1, n_cols +#ifdef BAND_TO_FULL_BLOCKING + nrow = (istep-1)*cwy_blocking + lc ! 
absolute number of pivot row +#else + nrow = (istep-1)*nbw+lc ! absolute number of pivot row +#endif + l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast + + hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) + if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck + nb = nb+l_rows + enddo + +#ifdef BAND_TO_FULL_BLOCKING + l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) + + ! compute tmat2 out of tmat(:,:,) + tmat_complete = 0 + do i = 1, t_blocking + t_cols = MIN(nbw, n_cols - (i-1)*nbw) + if (t_cols <= 0) exit + t_rows = (i - 1) * nbw + tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) + + if (i > 1) then + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), & + ONE, hvm(1,1), int(max_local_rows,kind=BLAS_KIND), hvm(1,(i-1)*nbw+1), & + int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking,kind=BLAS_KIND) ) + + call obj%timer%stop("blas") +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call mpi_allreduce(t_tmp, t_tmp2, int(cwy_blocking*nbw,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, & + MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + call obj%timer%start("blas") + call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), & + ONE, tmat_complete, int(cwy_blocking,kind=BLAS_KIND), t_tmp2, & + int(cwy_blocking,kind=BLAS_KIND) ) + call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), & + -ONE, tmat_complete(t_rows+1,t_rows+1), int(cwy_blocking,kind=BLAS_KIND), & + t_tmp2, int(cwy_blocking,kind=BLAS_KIND)) + call obj%timer%stop("blas") + + tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) + +#else /* WITH_MPI */ +! 
t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw) + call obj%timer%start("blas") + call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), & + ONE, tmat_complete, int(cwy_blocking,kind=BLAS_KIND), t_tmp, & + int(cwy_blocking,kind=BLAS_KIND)) + call PRECISION_TRMM('R', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), & + -ONE, tmat_complete(t_rows+1,t_rows+1), int(cwy_blocking,kind=BLAS_KIND), & + t_tmp, int(cwy_blocking,kind=BLAS_KIND)) + call obj%timer%stop("blas") + + tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols) + +#endif /* WITH_MPI */ + +! call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, ONE, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking) +! call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -ONE, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, & +! t_tmp2, cwy_blocking) + +! tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) + endif + enddo +#else /* BAND_TO_FULL_BLOCKING */ + l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) +#endif + + ! Q = Q - V * T**T * V**T * Q + + if (l_rows>0) then + call obj%timer%start("blas") + + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), & + ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, int(n_cols,kind=BLAS_KIND)) + call obj%timer%stop("blas") + + else ! l_rows>0 + + tmp1(1:l_cols*n_cols) = 0.0_rck + endif ! 
l_rows>0 + +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp1, tmp2, int(n_cols*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND) ,mpierr) + call obj%timer%stop("mpi_communication") + + call obj%timer%start("blas") + + if (l_rows>0) then +#ifdef BAND_TO_FULL_BLOCKING + + call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat_complete, & + int(cwy_blocking,kind=BLAS_KIND), tmp2, int(n_cols,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), & + -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(n_cols,kind=BLAS_KIND), & + ONE, q_mat, int(ldq,kind=BLAS_KIND)) + +#else /* BAND_TO_FULL_BLOCKING */ + + call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), & + int(ubound(tmat,dim=1),kind=BLAS_KIND), tmp2, int(n_cols,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(n_cols,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + tmp2, int(n_cols,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) + +#endif /* BAND_TO_FULL_BLOCKING */ + + endif + call obj%timer%stop("blas") +#else /* WITH_MPI */ +! 
tmp2 = tmp1 + call obj%timer%start("blas") + if (l_rows>0) then +#ifdef BAND_TO_FULL_BLOCKING + call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat_complete, & + int(cwy_blocking,kind=BLAS_KIND), tmp1, int(n_cols,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(n_cols,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + tmp1, int(n_cols,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) +#else /* BAND_TO_FULL_BLOCKING */ + + call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', & + int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat(1,1,istep), & + int(ubound(tmat,dim=1),kind=BLAS_KIND), tmp1, int(n_cols,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(n_cols,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + tmp1, int(n_cols,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) + +#endif /* BAND_TO_FULL_BLOCKING */ + endif + call obj%timer%stop("blas") +#endif /* WITH_MPI */ + +! if (l_rows>0) then +! call PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, ONE, tmat_complete, cwy_blocking, tmp2, n_cols) +! call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), tmp2, n_cols, ONE, q_mat, ldq) +! endif + + enddo ! istep + + endif ! useGPU + + deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when deallocating tmp1 tmp2 hvb "//errorMessage + stop 1 + endif + + if (useGPU) then + successCUDA = cuda_free(hvm_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaFree" + stop 1 + endif + + successCUDA = cuda_free(tmp_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaFree" + stop 1 + endif + + + ! final transfer of q_dev + successCUDA = cuda_memcpy(int(loc(q_mat),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, & + cudaMemcpyDeviceToHost) + + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudamemcpu q_dev" + stop 1 + endif + + ! q_mat(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols) + + successCUDA = cuda_free(q_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error in cudaFree" + stop 1 + endif + + ! deallocate(q_temp, stat=istat, errmsg=errorMessage) + ! if (istat .ne. 0) then + ! print *,"error when deallocating q_temp "//errorMessage + ! stop 1 + ! endif + ! deallocate(tmat_temp, stat=istat, errmsg=errorMessage) + ! if (istat .ne. 0) then + ! print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage + ! stop 1 + ! endif + + endif ! useGPU + + deallocate(hvm, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when deallocating hvm "//errorMessage + stop 1 + endif + +#if BAND_TO_FULL_BLOCKING + if (.not.(useGPU)) then + deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_band_to_full_& + &MATH_DATATYPE& + &: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage + stop 1 + endif + endif +#endif + + call obj%timer%stop("trans_ev_band_to_full_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + + end subroutine trans_ev_band_to_full_& + &MATH_DATATYPE& + &_& + &PRECISION + + diff -Nru elpa-2016.05.001/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 elpa-2019.11.001/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,2465 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! 
but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "../general/sanity.F90" + + subroutine trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, & + hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, max_threads, success, & + kernel) + + !------------------------------------------------------------------------------- + ! trans_ev_tridi_to_band_real/complex: + ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix + ! + ! Parameters + ! + ! na Order of matrix a, number of rows of matrix q + ! + ! nev Number eigenvectors to compute (= columns of matrix q) + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! nb semi bandwith + ! + ! q On input: Eigenvectors of tridiagonal matrix + ! On output: Transformed eigenvectors + ! Distribution is like in Scalapack. + ! + ! q_dev GPU device pointer to q + ! + ! 
ldq Leading dimension of q + ! matrixCols local columns of matrix q + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns/both + ! + !------------------------------------------------------------------------------- + use elpa_abstract_impl + use elpa2_workload + use pack_unpack_cpu + use pack_unpack_gpu + use compute_hh_trafo + use cuda_functions + use precision + use iso_c_binding +#ifdef WITH_OPENMP + ! use omp_lib +#endif + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: useGPU + + integer(kind=ik), intent(in) :: kernel + integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: q(ldq,*) +#else + MATH_DATATYPE(kind=rck) :: q(ldq,matrixCols) +#endif + + MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:) + integer(kind=c_intptr_t) :: q_dev + + integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol + integer(kind=MPI_KIND) :: np_rowsMPI, my_prowMPI, np_colsMPI, my_pcolMPI + integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 + integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end + integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end + integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length + integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count +#ifdef WITH_OPENMP + integer(kind=ik) :: thread_width, csw, b_off, b_len +#endif + integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd + integer(kind=ik) :: a_off, current_tv_off, max_blk_size + integer(kind=ik) :: src, src_offset, dst, offset, nfact, num_blk + integer(kind=MPI_KIND) :: mpierr + + logical :: flag +#ifdef WITH_OPENMP + MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:) +#else + MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:) +#endif + MATH_DATATYPE(kind=rck) :: a_var + + type(c_ptr) :: aIntern_ptr + + 
MATH_DATATYPE(kind=rck) , allocatable :: row(:) + MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:) + +#ifdef WITH_OPENMP + MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:) + MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:) + MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:) + MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:) +#else + MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:) + MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:) + MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:) + MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:) +#endif + + integer(kind=c_intptr_t) :: aIntern_dev + integer(kind=c_intptr_t) :: bcast_buffer_dev + integer(kind=c_intptr_t) :: num + integer(kind=c_intptr_t) :: dev_offset, dev_offset_1 + integer(kind=c_intptr_t) :: row_dev + integer(kind=c_intptr_t) :: row_group_dev + integer(kind=c_intptr_t) :: hh_tau_dev + integer(kind=c_intptr_t) :: hh_dot_dev + integer(kind=ik) :: row_group_size, unpack_idx + + integer(kind=ik) :: n_times + integer(kind=ik) :: top, chunk, this_chunk + + MATH_DATATYPE(kind=rck), allocatable :: result_buffer(:,:,:) + MATH_DATATYPE(kind=rck), allocatable :: bcast_buffer(:,:) + + integer(kind=ik) :: n_off + + integer(kind=MPI_KIND), allocatable :: result_send_request(:), result_recv_request(:) + integer(kind=ik), allocatable :: limits(:) + integer(kind=MPI_KIND), allocatable :: top_send_request(:), bottom_send_request(:) + integer(kind=MPI_KIND), allocatable :: top_recv_request(:), bottom_recv_request(:) + + ! MPI send/recv tags, arbitrary + + integer(kind=ik), parameter :: bottom_recv_tag = 111 + integer(kind=ik), parameter :: top_recv_tag = 222 + integer(kind=ik), parameter :: result_recv_tag = 333 + + integer(kind=ik), intent(in) :: max_threads + +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread +#endif + + + ! 
Just for measuring the kernel performance + real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double + ! long integer + integer(kind=lik) :: kernel_flops, kernel_flops_recv + + logical, intent(in) :: wantDebug + logical :: success + integer(kind=ik) :: istat, print_flops + character(200) :: errorMessage + character(20) :: gpuString + logical :: successCUDA +#ifndef WITH_MPI + integer(kind=ik) :: j1 +#endif + integer(kind=ik) :: error + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + + n_times = 0 + if (useGPU) then + unpack_idx = 0 + row_group_size = 0 + endif + + success = .true. + kernel_time = 0.0 + kernel_flops = 0 + + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Comm_rank(int(mpi_comm_rows,kind=MPI_KIND) , my_prowMPI , mpierr) + call MPI_Comm_size(int(mpi_comm_rows,kind=MPI_KIND) , np_rowsMPI , mpierr) + call MPI_Comm_rank(int(mpi_comm_cols,kind=MPI_KIND) , my_pcolMPI , mpierr) + call MPI_Comm_size(int(mpi_comm_cols,kind=MPI_KIND) , np_colsMPI , mpierr) + + my_prow = int(my_prowMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + if (wantDebug) call obj%timer%stop("mpi_communication") + + if (mod(nbw,nblk)/=0) then + if (my_prow==0 .and. my_pcol==0) then + if (wantDebug) then + write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: ERROR: nbw=',nbw,', nblk=',nblk + write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: band backtransform works only for nbw==n*nblk' + endif + success = .false. + return + endif + endif + + nfact = nbw / nblk + + + ! 
local number of eigenvectors + l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) + + if (l_nev==0) then +#ifdef WITH_OPENMP + thread_width = 0 +#endif + stripe_width = 0 + stripe_count = 0 + last_stripe_width = 0 + + else ! l_nev + +#if WITH_OPENMP + ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into + ! every primary cache + ! Suggested stripe width is 48 - should this be reduced for the complex case ??? + + if (useGPU) then + stripe_width = 256 ! Must be a multiple of 4 + stripe_count = (l_nev - 1) / stripe_width + 1 + else ! useGPU + ! openmp only in non-GPU case + thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread + +#if REALCASE == 1 + call obj%get("stripewidth_real",stripe_width, error) + +#ifdef DOUBLE_PRECISION_REAL + !stripe_width = 48 ! Must be a multiple of 4 +#else + stripe_width = stripe_width * 2 + !stripe_width = 96 ! Must be a multiple of 8 +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + call obj%get("stripewidth_complex",stripe_width, error) + +#ifdef DOUBLE_PRECISION_COMPLEX + !stripe_width = 48 ! Must be a multiple of 2 +#else + stripe_width = stripe_width * 2 + !stripe_width = 48 ! Must be a multiple of 4 +#endif +#endif /* COMPLEXCASE */ + + stripe_count = (thread_width-1)/stripe_width + 1 + + ! Adapt stripe width so that last one doesn't get too small + + stripe_width = (thread_width-1)/stripe_count + 1 + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then + + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes + ! (8 * sizeof(double) == 64) + + else + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes + ! (4 * sizeof(double) == 32) + endif +#else + if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. 
& + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then + + + stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes + ! (16 * sizeof(float) == 64) + + else + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes + ! (8 * sizeof(float) == 32) + endif +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & + kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then + + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes + ! (4 * sizeof(double complex) == 64) + + else + + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes + ! (2 * sizeof(double complex) == 32) + endif +#else + + if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & + kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then + + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes + ! (8 * sizeof(float complex) == 64) + + else + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes + ! (4 * sizeof(float complex) == 32) + endif +#endif +#endif /* COMPLEXCASE */ + +#if REALCASE == 1 + last_stripe_width = l_nev - (stripe_count-1)*stripe_width +#endif +#if COMPLEXCASE == 1 +! only needed in no OMP case check thsis +! last_stripe_width = l_nev - (stripe_count-1)*stripe_width +#endif + + endif ! useGPU + +#else /* WITH_OPENMP */ + + ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into + ! every primary cache + ! Suggested stripe width is 48 - should this be reduced for the complex case ??? + + if (useGPU) then + stripe_width = 256 ! 
Must be a multiple of 4 + stripe_count = (l_nev - 1) / stripe_width + 1 + + else ! useGPU +#if REALCASE == 1 + call obj%get("stripewidth_real",stripe_width, error) + +#ifdef DOUBLE_PRECISION_REAL + !stripe_width = 48 ! Must be a multiple of 4 +#else + !stripe_width = 96 ! Must be a multiple of 8 + stripe_width = 2 * stripe_width +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + call obj%get("stripewidth_complex",stripe_width, error) + +#ifdef DOUBLE_PRECISION_COMPLEX + !stripe_width = 48 ! Must be a multiple of 2 +#else + !stripe_width = 48 ! Must be a multiple of 4 +#endif +#endif /* COMPLEXCASE */ + + stripe_count = (l_nev-1)/stripe_width + 1 + + ! Adapt stripe width so that last one doesn't get too small + + stripe_width = (l_nev-1)/stripe_count + 1 + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then + + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes + ! (8 * sizeof(double) == 64) + + else + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes + ! (4 * sizeof(double) == 32) + endif +#else + if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. & + kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then + + + stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes + ! (16 * sizeof(float) == 64) + + else + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes + ! (8 * sizeof(float) == 32) + endif +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + + if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & + kernel .eq. 
ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then + + stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes + ! (4 * sizeof(double complex) == 64) + + else + + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes + ! (2 * sizeof(double complex) == 32) + endif +#else + + if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. & + kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then + + stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes + ! (8 * sizeof(float complex) == 64) + + else + stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes + ! (4 * sizeof(float complex) == 32) + endif +#endif +#endif /* COMPLEXCASE */ + endif ! useGPU + + last_stripe_width = l_nev - (stripe_count-1)*stripe_width + +#endif /* WITH_OPENMP */ + endif ! l_nev + + ! Determine the matrix distribution at the beginning + + allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating limits"//errorMessage + stop 1 + endif + call determine_workload(obj,na, nbw, np_rows, limits) + + max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) + + a_dim2 = max_blk_size + nbw + + if (useGPU) then + num = (stripe_width*a_dim2*stripe_count)* size_of_datatype + successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc"//errorMessage + stop 1 + endif + + successCUDA = cuda_memset(aIntern_dev , 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset"//errorMessage + stop 1 + endif + + num = (l_nev)* size_of_datatype + successCUDA = cuda_malloc( row_dev,num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc " + stop 1 + endif + + successCUDA = cuda_memset(row_dev , 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset " + stop 1 + endif + + ! "row_group" and "row_group_dev" are needed for GPU optimizations + allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating row_group"//errorMessage + stop 1 + endif + + row_group(:, :) = 0.0_rck + num = (l_nev*nblk)* size_of_datatype + successCUDA = cuda_malloc(row_group_dev, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc"//errorMessage + stop 1 + endif + + successCUDA = cuda_memset(row_group_dev , 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset"//errorMessage + stop 1 + endif + + else ! GPUs are not used + +#if 0 +! 
realcase or complexcase +!DEC$ ATTRIBUTES ALIGN: 64:: aIntern +#endif + +#ifdef WITH_OPENMP + if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads* & + C_SIZEOF(a_var)) /= 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating aIntern"//errorMessage + stop 1 + endif + + call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads]) + ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage) + + ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here! + +#else /* WITH_OPENMP */ + + if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count* & + C_SIZEOF(a_var)) /= 0) then + print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage + stop 1 + endif + + call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] ) + !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage) + + aIntern(:,:,:) = 0.0_rck +#endif /* WITH_OPENMP */ + endif !useGPU + + allocate(row(l_nev), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating row"//errorMessage + stop 1 + endif + + row(:) = 0.0_rck + + ! Copy q from a block cyclic distribution into a distribution with contiguous rows, + ! and transpose the matrix using stripes of given stripe_width for cache blocking. + + ! The peculiar way it is done below is due to the fact that the last row should be + ! ready first since it is the first one to start below + +#ifdef WITH_OPENMP + ! Please note about the OMP usage below: + ! This is not for speed, but because we want the matrix a in the memory and + ! 
in the cache of the correct thread (if possible) + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + !$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation! + enddo + !$omp end parallel do + + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) +#endif /* WITH_OPENMP */ + + do ip = np_rows-1, 0, -1 + if (my_prow == ip) then + ! Receive my rows which have not yet been received + src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) + do i=limits(ip)+1,limits(ip+1) + src = mod((i-1)/nblk, np_rows) + + if (src < my_prow) then +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +! row(1:l_nev) = row(1:l_nev) + +#endif /* WITH_MPI */ + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION & + (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, & + thread_width, stripe_width, l_nev) + + enddo +!$omp end parallel do + + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + if (useGPU) then + ! An unpacking of the current row group may occur before queuing the next row + call unpack_and_prepare_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + ( & + row_group, row_group_dev, aIntern_dev, stripe_count, & + stripe_width, last_stripe_width, a_dim2, l_nev,& + row_group_size, nblk, unpack_idx, & + i - limits(ip), .false.) 
+#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row_group(:, row_group_size), int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? +#endif /* WITH_MPI */ + + else ! useGPU +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +! row(1:l_nev) = row(1:l_nev) + +#endif /* WITH_MPI */ + + call unpack_row_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION & + (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) + endif ! useGPU +#endif /* WITH_OPENMP */ + + elseif (src == my_prow) then + + src_offset = src_offset+1 + + if (useGPU) then +#ifndef WITH_OPENMP + + ! An unpacking of the current row group may occur before queuing the next row + call unpack_and_prepare_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + ( & + row_group, row_group_dev, aIntern_dev, stripe_count, & + stripe_width, last_stripe_width, a_dim2, l_nev,& + row_group_size, nblk, unpack_idx, & + i - limits(ip), .false.) + + row_group(:, row_group_size) = q(src_offset, 1:l_nev) +#else /* WITH_OPENMP */ + +!#if COMPLEXCASE == 1 +!! why is an cuda call in the openmp region? +! call unpack_and_prepare_row_group_complex_gpu_& +! &PRECISION& +! &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, & +! last_stripe_width, a_dim2, l_nev, row_group_size, nblk, & +! unpack_idx, i - limits(ip),.false.) +! 
row_group(:, row_group_size) = q(src_offset, 1:l_nev) +!#endif + +#endif /* not OpenMP */ + else + row(:) = q(src_offset, 1:l_nev) + endif + +#ifdef WITH_OPENMP + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION & + (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev) + + enddo +!$omp end parallel do + + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + + if (useGPU) then + + else + call unpack_row_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION & + (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) + endif + +#endif /* WITH_OPENMP */ + + endif + enddo + + ! Send all rows which have not yet been send + src_offset = 0 + do dst = 0, ip-1 + do i=limits(dst)+1,limits(dst+1) + if (mod((i-1)/nblk, np_rows) == my_prow) then + src_offset = src_offset+1 + row(:) = q(src_offset, 1:l_nev) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Send(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(dst,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + enddo + enddo + + else if (my_prow < ip) then + + ! 
Send all rows going to PE ip + src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) + do i=limits(ip)+1,limits(ip+1) + src = mod((i-1)/nblk, np_rows) + if (src == my_prow) then + src_offset = src_offset+1 + row(:) = q(src_offset, 1:l_nev) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Send(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(ip,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + enddo + + ! Receive all rows from PE ip + do i=limits(my_prow)+1,limits(my_prow+1) + src = mod((i-1)/nblk, np_rows) + if (src == ip) then +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + +! row(1:l_nev) = row(1:l_nev) + +#endif /* WITH_MPI */ + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION & + (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + if (useGPU) then + ! An unpacking of the current row group may occur before queuing the next row + call unpack_and_prepare_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &( & + row_group, row_group_dev, aIntern_dev, stripe_count, & + stripe_width, last_stripe_width, a_dim2, l_nev, & + row_group_size, nblk, unpack_idx, & + i - limits(my_prow), .false.) 
+ +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row_group(:, row_group_size), int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ? +#endif /* WITH_MPI */ + + else ! useGPU +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + +! row(1:l_nev) = row(1:l_nev) + +#endif + call unpack_row_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION & + (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) + endif ! useGPU + +#endif /* WITH_OPENMP */ + + endif + enddo + endif + enddo + + if (useGPU) then + ! Force an unpacking of all remaining rows that haven't been unpacked yet + call unpack_and_prepare_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &( & + row_group, row_group_dev, aIntern_dev, stripe_count, & + stripe_width, last_stripe_width, & + a_dim2, l_nev, row_group_size, nblk, unpack_idx, & + -1, .true.) + + successCUDA = cuda_devicesynchronize() + + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaDeviceSynchronize"//errorMessage + stop 1 + endif + endif + + ! Set up result buffer queue + + num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows + + num_result_buffers = 4*nfact + allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating result_buffer"//errorMessage + stop 1 + endif + + allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating result_send_request"//errorMessage + stop 1 + endif + + allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating result_recv_request"//errorMessage + stop 1 + endif + +#ifdef WITH_MPI + result_send_request(:) = MPI_REQUEST_NULL + result_recv_request(:) = MPI_REQUEST_NULL +#endif + + ! Queue up buffers +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends + do j = 1, min(num_result_buffers, num_result_blocks) + call MPI_Irecv(result_buffer(1,1,j), int(l_nev*nblk,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + 0_MPI_KIND, int(result_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + result_recv_request(j), mpierr) + enddo + endif + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + ! carefull the "recv" has to be done at the corresponding wait or send + ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) + +#endif /* WITH_MPI */ + + num_bufs_recvd = 0 ! No buffers received yet + + ! Initialize top/bottom requests + + allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MPI_DATATYPE& + &: error when allocating top_send_request"//errorMessage + stop 1 + endif + + allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating top_recv_request"//errorMessage + stop 1 + endif + + allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_send_request"//errorMessage + stop 1 + endif + + allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_recv_request"//errorMessage + stop 1 + endif + +#ifdef WITH_MPI + top_send_request(:) = MPI_REQUEST_NULL + top_recv_request(:) = MPI_REQUEST_NULL + bottom_send_request(:) = MPI_REQUEST_NULL + bottom_recv_request(:) = MPI_REQUEST_NULL +#endif + +#ifdef WITH_OPENMP + allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating top_border_send_buffer"//errorMessage + stop 1 + endif + + allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating top_border_recv_buffer"//errorMessage + stop 1 + endif + + allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_border_send_buffer"//errorMessage + stop 1 + endif + + allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_border_recv_buffer"//errorMessage + stop 1 + endif + + top_border_send_buffer(:,:) = 0.0_rck + top_border_recv_buffer(:,:) = 0.0_rck + bottom_border_send_buffer(:,:) = 0.0_rck + bottom_border_recv_buffer(:,:) = 0.0_rck + ! Initialize broadcast buffer + +#else /* WITH_OPENMP */ + + allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating top_border_send_bufer"//errorMessage + stop 1 + endif + + allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating top_border_recv_buffer"//errorMessage + stop 1 + endif + + allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_border_send_buffer"//errorMessage + stop 1 + endif + + allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bottom_border_recv_buffer"//errorMessage + stop 1 + endif + + top_border_send_buffer(:,:,:) = 0.0_rck + top_border_recv_buffer(:,:,:) = 0.0_rck + bottom_border_send_buffer(:,:,:) = 0.0_rck + bottom_border_recv_buffer(:,:,:) = 0.0_rck +#endif /* WITH_OPENMP */ + + ! Initialize broadcast buffer + + allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when allocating bcast_buffer"//errorMessage + stop 1 + endif + + bcast_buffer = 0.0_rck + if (useGPU) then + num = ( nbw * max_blk_size) * size_of_datatype + successCUDA = cuda_malloc(bcast_buffer_dev, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + successCUDA = cuda_memset( bcast_buffer_dev, 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset" + stop 1 + endif + + num = ((max_blk_size-1))* size_of_datatype + successCUDA = cuda_malloc( hh_dot_dev, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + successCUDA = cuda_memset( hh_dot_dev, 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset" + stop 1 + endif + + num = (max_blk_size)* size_of_datatype + successCUDA = cuda_malloc( hh_tau_dev, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + successCUDA = cuda_memset( hh_tau_dev, 0, num) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset" + stop 1 + endif + endif ! useGPU + + current_tv_off = 0 ! Offset of next row to be broadcast + + ! ------------------- start of work loop ------------------- + + a_off = 0 ! 
offset in aIntern (to avoid unnecessary shifts) + + top_msg_length = 0 + bottom_msg_length = 0 + + do sweep = 0, (na-1)/nbw + + current_n = na - sweep*nbw + call determine_workload(obj,current_n, nbw, np_rows, limits) + current_n_start = limits(my_prow) + current_n_end = limits(my_prow+1) + current_local_n = current_n_end - current_n_start + + next_n = max(current_n - nbw, 0) + call determine_workload(obj,next_n, nbw, np_rows, limits) + next_n_start = limits(my_prow) + next_n_end = limits(my_prow+1) + next_local_n = next_n_end - next_n_start + + if (next_n_end < next_n) then + bottom_msg_length = current_n_end - next_n_end + else + bottom_msg_length = 0 + endif + + if (next_local_n > 0) then + next_top_msg_length = current_n_start - next_n_start + else + next_top_msg_length = 0 + endif + + if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") +#endif + do i = 1, stripe_count + +#ifdef WITH_OPENMP + + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif + + csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" + b_len = csw*nbw*max_threads +#ifdef WITH_MPI + call MPI_Irecv(bottom_border_recv_buffer(1,i), int(b_len,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), & + int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_recv_request(i), mpierr) + +#else /* WITH_MPI */ +! carefull the "recieve" has to be done at the corresponding wait or send +! 
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), & + int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_recv_request(i), mpierr) +#else /* WITH_MPI */ +! carefull the recieve has to be done at the corresponding wait or send +! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) +#endif /* WITH_MPI */ + +#endif /* WITH_OPENMP */ + + enddo +#if WITH_MPI + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + endif + + if (current_local_n > 1) then + if (my_pcol == mod(sweep,np_cols)) then + bcast_buffer(:,1:current_local_n) = & + hh_trans(:,current_tv_off+1:current_tv_off+current_local_n) + current_tv_off = current_tv_off + current_local_n + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_bcast(bcast_buffer, int(nbw*current_local_n,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(mod(sweep,np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + + if (useGPU) then + successCUDA = cuda_memcpy(bcast_buffer_dev, int(loc(bcast_buffer(1,1)),kind=c_intptr_t), & + nbw * current_local_n * & + size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + + call extract_hh_tau_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & +!#if REALCASE == 1 + (bcast_buffer_dev, hh_tau_dev, nbw, & +!#endif +!#if COMPLEXCASE == 1 +! ( nbw, & +!#endif + current_local_n, .false.) 
+ call compute_hh_dot_products_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + (bcast_buffer_dev, hh_dot_dev, nbw, & + current_local_n) + endif ! useGPU + + else ! (current_local_n > 1) then + + ! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex + bcast_buffer(:,1) = 0.0_rck + if (useGPU) then + successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemset" + stop 1 + endif + + call extract_hh_tau_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &( & + bcast_buffer_dev, hh_tau_dev, & + nbw, 1, .true.) + endif ! useGPU + endif ! (current_local_n > 1) then + + if (l_nev == 0) cycle + + if (current_local_n > 0) then + + do i = 1, stripe_count +#ifdef WITH_OPENMP + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif + + ! Get real stripe width for strip i; + ! The last OpenMP tasks may have an even smaller stripe with, + ! but we don't care about this, i.e. we send/recv a bit too much in this case. + ! 
csw: current_stripe_width + + csw = min(stripe_width, thread_width-(i-1)*stripe_width) +#endif /* WITH_OPENMP */ + + !wait_b + if (current_n_end < current_n) then + + +#ifdef WITH_OPENMP + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + n_off = current_local_n+a_off + b_len = csw*nbw + b_off = (my_thread-1)*b_len + aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = & + reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif + n_off = current_local_n+a_off + + if (useGPU) then + dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype + successCUDA = cuda_memcpy( aIntern_dev + dev_offset , & + int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), & + stripe_width*nbw* size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + + else + aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) + endif + +#endif /* WITH_OPENMP */ + + if (next_n_end < next_n) then + +#ifdef WITH_OPENMP + + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif +#ifdef WITH_MPI + if (wantDebug) call 
obj%timer%start("mpi_communication") + call MPI_Irecv(bottom_border_recv_buffer(1,i), int(csw*nbw*max_threads,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), & + int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_recv_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WTIH_MPI */ +! carefull the recieve has to be done at the corresponding wait or send +! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) + +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), & + int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_recv_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +!! carefull the recieve has to be done at the corresponding wait or send +!! 
bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) + +#endif /* WITH_MPI */ + +#endif /* WITH_OPENMP */ + endif + endif + + if (current_local_n <= bottom_msg_length + top_msg_length) then + + !wait_t + if (top_msg_length>0) then + +#ifdef WITH_OPENMP + if (useGPU) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: not yet implemented" + stop 1 + endif +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + + if (useGPU) then + dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8 + successCUDA = cuda_memcpy( aIntern_dev+dev_offset , int(loc(top_border_recv_buffer(1,1,i)),kind=c_intptr_t), & + stripe_width*top_msg_length* size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + else ! useGPU + aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) + endif ! useGPU +#endif /* WITH_OPENMP */ + endif ! 
top_msg_length + + !compute +#ifdef WITH_OPENMP + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if (top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + + call compute_hh_trafo_& + &MATH_DATATYPE& + &_openmp_& + &PRECISION & + (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & + l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, & + i, my_thread, thread_width, kernel) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + + call compute_hh_trafo_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & + a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, & + last_stripe_width, kernel) +#endif /* WITH_OPENMP */ + + !send_b 1 +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + + if (bottom_msg_length>0) then + n_off = current_local_n+nbw-bottom_msg_length+a_off +#ifdef WITH_OPENMP + b_len = csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call 
MPI_Isend(bottom_border_send_buffer(1,i), int(b_len,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (next_top_msg_length > 0) then + top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & + next_top_msg_length*max_threads,i) + endif + +#endif /* WITH_MPI */ + +!#if REALCASE == 1 + endif ! this endif is not here in complex -case is for bottom_msg_length +!#endif + +#else /* WITH_OPENMP */ + + if (useGPU) then + dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + successCUDA = cuda_memcpy( int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, & + stripe_width * bottom_msg_length * size_of_datatype, & + cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + else + bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) + endif +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + if (next_top_msg_length > 0) then + top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & + bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) + endif + +#endif /* WITH_MPI */ + endif +#endif /* WITH_OPENMP */ + + else ! 
current_local_n <= bottom_msg_length + top_msg_length + + !compute +#ifdef WITH_OPENMP + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + + !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + + call compute_hh_trafo_& + &MATH_DATATYPE& + &_openmp_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & + nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, & + bottom_msg_length, i, my_thread, thread_width, kernel) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + + !send_b +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + if (bottom_msg_length > 0) then + n_off = current_local_n+nbw-bottom_msg_length+a_off + b_len = csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(bottom_border_send_buffer(1,i), int(b_len,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + bottom_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + if (next_top_msg_length > 0) then + top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* & + next_top_msg_length*& + max_threads,i) + endif +#endif /* WITH_MPI */ + endif + +#else /* WITH_OPENMP */ 
+ + call compute_hh_trafo_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & + a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, & + current_local_n - bottom_msg_length, bottom_msg_length, i, & + last_stripe_width, kernel) + + + + !send_b +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + if (bottom_msg_length > 0) then + n_off = current_local_n+nbw-bottom_msg_length+a_off + + if (useGPU) then + dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, & + stripe_width*bottom_msg_length* size_of_datatype, & + cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error cudaMemcpy" + stop 1 + endif + else + bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i) + endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (next_top_msg_length > 0) then + top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & + bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) + endif + +#endif /* WITH_MPI */ + +#if REALCASE == 1 + endif +#endif + +#endif /* WITH_OPENMP */ + 
+#ifndef WITH_OPENMP +#if COMPLEXCASE == 1 + endif +#endif +#endif + !compute +#ifdef WITH_OPENMP + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call compute_hh_trafo_& + &MATH_DATATYPE& + &_openmp_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, & + nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,& + current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & + kernel) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + + call compute_hh_trafo_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & + a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, & + current_local_n-top_msg_length-bottom_msg_length, i, & + last_stripe_width, kernel) + +#endif /* WITH_OPENMP */ + + !wait_t + if (top_msg_length>0) then +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + if (useGPU) then + dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + successCUDA = cuda_memcpy( aIntern_dev + dev_offset ,int(loc( 
top_border_recv_buffer(:,1,i)),kind=c_intptr_t), & + stripe_width * top_msg_length * size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + else + aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) + endif +#endif /* WITH_OPENMP */ + endif + + !compute +#ifdef WITH_OPENMP + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if (top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + call compute_hh_trafo_& + &MATH_DATATYPE& + &_openmp_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, & + nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, & + thread_width, kernel) + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + + call compute_hh_trafo_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, & + a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & +#if REALCASE == 1 + hh_dot_dev, & +#endif + hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, & + last_stripe_width, kernel) + +#endif /* WITH_OPENMP */ + endif + + if (next_top_msg_length > 0) then + !request top_border data +#ifdef WITH_OPENMP + + b_len = csw*next_top_msg_length*max_threads +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call 
MPI_Irecv(top_border_recv_buffer(1,i), int(b_len,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_prow-1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + top_recv_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ +! carefull the "recieve" has to be done at the corresponding wait or send +! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & +! bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Irecv(top_border_recv_buffer(1,1,i), int(next_top_msg_length*stripe_width,kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow-1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), top_recv_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ +! carefull the "recieve" has to be done at the corresponding wait or send +! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & +! 
bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) +#endif /* WITH_MPI */ + +#endif /* WITH_OPENMP */ + + endif + + !send_t + if (my_prow > 0) then +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + b_len = csw*nbw*max_threads + top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(top_border_send_buffer(1,i), int(b_len,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_prow-1,kind=MPI_KIND), int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + top_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then + bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) + endif + if (next_n_end < next_n) then + bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) + endif +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + if (useGPU) then + dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + successCUDA = cuda_memcpy( int(loc(top_border_send_buffer(:,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, & + stripe_width*nbw * size_of_datatype, & + cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMemcpy" + stop 1 + endif + + else + top_border_send_buffer(:,1:nbw,i) = 
aIntern(:,a_off+1:a_off+nbw,i) + endif +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(top_border_send_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_prow-1,kind=MPI_KIND), int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + top_send_request(i), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then + bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) + endif + if (next_n_end < next_n) then + bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) + endif +#endif /* WITH_MPI */ + +#endif /* WITH_OPENMP */ + endif + + ! Care that there are not too many outstanding top_recv_request's + if (stripe_count > 1) then + if (i>1) then + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + else + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + + endif + endif + + enddo + + top_msg_length = next_top_msg_length + + else + ! wait for last top_send_request + +#ifdef WITH_MPI + do i = 1, stripe_count + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + enddo +#endif + endif + + ! Care about the result + + if (my_prow == 0) then + + ! topmost process sends nbw rows to destination processes + + do j=0, nfact-1 + num_blk = sweep*nfact+j ! 
global number of destination block, 0 based + if (num_blk*nblk >= na) exit + + nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif + dst = mod(num_blk, np_rows) + + if (dst == 0) then + if (useGPU) then + row_group_size = min(na - num_blk*nblk, nblk) + call pack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &(row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & + row_group(:, :), j * nblk + a_off, row_group_size) + + do i = 1, row_group_size + q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) + enddo + else ! useGPU + + do i = 1, min(na - num_blk*nblk, nblk) +#ifdef WITH_OPENMP + call pack_row_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION& + &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev) +#else /* WITH_OPENMP */ + + call pack_row_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION& + &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) +#endif /* WITH_OPENMP */ + q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) + enddo + endif ! useGPU + + else ! (dst == 0) + + if (useGPU) then + call pack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &(row_group_dev, aIntern_dev, stripe_count, stripe_width, & + last_stripe_width, a_dim2, l_nev, & + result_buffer(:, :, nbuf), j * nblk + a_off, nblk) + + else ! 
useGPU + do i = 1, nblk +#if WITH_OPENMP + call pack_row_& + &MATH_DATATYPE& + &_cpu_openmp_& + &PRECISION& + &(obj,aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, & + max_threads, thread_width, l_nev) +#else /* WITH_OPENMP */ + call pack_row_& + &MATH_DATATYPE& + &_cpu_& + &PRECISION& + &(obj, aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) +#endif /* WITH_OPENMP */ + enddo + endif ! useGPU +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Isend(result_buffer(1,1,nbuf), int(l_nev*nblk,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(dst,kind=MPI_KIND), int(result_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + result_send_request(nbuf), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + if (j+num_result_buffers < num_result_blocks) & + result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) + if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends + do j1 = 1, min(num_result_buffers, num_result_blocks) + result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) + enddo + endif + +#endif /* WITH_MPI */ + endif ! (dst == 0) + enddo !j=0, nfact-1 + + else ! (my_prow == 0) + + ! receive and store final result + + do j = num_bufs_recvd, num_result_blocks-1 + + nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block + + ! If there is still work to do, just test for the next result request + ! and leave the loop if it is not ready, otherwise wait for all + ! outstanding requests + + if (next_local_n > 0) then + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + flag = .true. +#endif + + if (.not.flag) exit + + else ! 
(next_local_n > 0) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + endif ! (next_local_n > 0) + + ! Fill result buffer into q + num_blk = j*np_rows + my_prow ! global number of current block, 0 based + do i = 1, min(na - num_blk*nblk, nblk) + q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) + enddo + + ! Queue result buffer again if there are outstanding blocks left +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + if (j+num_result_buffers < num_result_blocks) & + call MPI_Irecv(result_buffer(1,1,nbuf), int(l_nev*nblk,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + 0_MPI_KIND, int(result_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), & + result_recv_request(nbuf), mpierr) + + ! carefull the "recieve" has to be done at the corresponding wait or send +! if (j+num_result_buffers < num_result_blocks) & +! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + +#endif /* WITH_MPI */ + + enddo ! j = num_bufs_recvd, num_result_blocks-1 + num_bufs_recvd = j + + endif ! (my_prow == 0) + + ! Shift the remaining rows to the front of aIntern (if necessary) + + offset = nbw - top_msg_length + if (offset<0) then + if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: internal error, offset for shifting = ',offset + success = .false. 
+ return + endif + + a_off = a_off + offset + if (a_off + next_local_n + nbw >= a_dim2) then +#ifdef WITH_OPENMP + if (useGPU) then + print *,"trans_ev_tridi_to_band_real: not yet implemented" + stop 1 + endif + + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread, i, j), schedule(static, 1) + do my_thread = 1, max_threads + do i = 1, stripe_count + do j = top_msg_length+1, top_msg_length+next_local_n + aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread) + enddo + enddo + enddo +!$omp end parallel do + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + +#else /* WITH_OPENMP */ + do i = 1, stripe_count + if (useGPU) then + chunk = min(next_local_n - 1, a_off) + do j = top_msg_length + 1, top_msg_length + next_local_n, chunk + top = min(j + chunk, top_msg_length + next_local_n) + this_chunk = top - j + 1 + dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype + ! it is not logical to set here always the value for the parameter + ! "cudaMemcpyDeviceToDevice" do this ONCE at startup + ! tmp = cuda_d2d(1) + successCUDA = cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, & + stripe_width*this_chunk* size_of_datatype, cudaMemcpyDeviceToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error cudaMemcpy" + stop 1 + endif + enddo + else ! not useGPU + do j = top_msg_length+1, top_msg_length+next_local_n + aIntern(:,j,i) = aIntern(:,j+a_off,i) + enddo + endif + enddo ! stripe_count +#endif /* WITH_OPENMP */ + + a_off = 0 + endif + enddo + + ! 
Just for safety: +#ifdef WITH_MPI + if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol + if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol + if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol + if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol +#endif + + if (my_prow == 0) then + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + endif + +#ifdef WITH_MPI + if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol + if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol + + + call obj%get("print_flops",print_flops,error) +#ifdef HAVE_DETAILED_TIMINGS + if (print_flops == 1) then + call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr) + kernel_flops = kernel_flops_recv + call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr) + kernel_flops = kernel_flops_recv + + call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr) + kernel_time_recv = kernel_time + call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr) + kernel_time_recv = kernel_time + endif +#endif + +#endif /* WITH_MPI */ + + if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) & + write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 + + if (useGPU) then + ! 
copy q to q_dev needed in trans_ev_band_to_full + successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif + + ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band + successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), (ldq)*(matrixCols)* size_of_datatype, & + cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaMalloc" + stop 1 + endif +! endif + endif !use GPU + + ! deallocate all working space + + if (.not.(useGPU)) then + nullify(aIntern) + call free(aIntern_ptr) + endif + + deallocate(row, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating row "//errorMessage + stop 1 + endif + + deallocate(limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating limits"//errorMessage + stop 1 + endif + + deallocate(result_send_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating result_send_request "//errorMessage + stop 1 + endif + + deallocate(result_recv_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating result_recv_request "//errorMessage + stop 1 + endif + + deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating top_border_send_buffer "//errorMessage + stop 1 + endif + + deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating top_border_recv_buffer "//errorMessage + stop 1 + endif + + deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating bottom_border_send_buffer "//errorMessage + stop 1 + endif + + deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating bottom_border_recv_buffer "//errorMessage + stop 1 + endif + + deallocate(result_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating result_buffer "//errorMessage + stop 1 + endif + + deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating bcast_buffer "//errorMessage + stop 1 + endif + + deallocate(top_send_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating top_send_request "//errorMessage + stop 1 + endif + + deallocate(top_recv_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating top_recv_request "//errorMessage + stop 1 + endif + + deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating bottom_send_request "//errorMessage + stop 1 + endif + + deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating bottom_recv_request "//errorMessage + stop 1 + endif + + if (useGPU) then +#if COMPLEXCASE == 1 + ! should this not hbe done always? + successCUDA = cuda_free(aIntern_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_complex: error in cudaFree" + stop 1 + endif +#endif + successCUDA = cuda_free(hh_dot_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &real: error in cudaFree "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(hh_tau_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaFree "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(row_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaFree "//errorMessage + stop 1 + endif + + deallocate(row_group, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error when deallocating row_group "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(row_group_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaFree "//errorMessage + stop 1 + endif + + successCUDA = cuda_free(bcast_buffer_dev) + if (.not.(successCUDA)) then + print *,"trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &: error in cudaFree "//errorMessage + stop 1 + endif + endif ! useGPU + + + call obj%timer%stop("trans_ev_tridi_to_band_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + + return +!#if COMPLEXCASE == 1 +! contains +! ! The host wrapper for extracting "tau" from the HH reflectors (see the +! ! kernel below) +! subroutine extract_hh_tau_complex_gpu_& +! &PRECISION& +! &(nbw, n, is_zero) +! use cuda_c_kernel +! use pack_unpack_gpu +! use precision +! implicit none +! integer(kind=ik), value :: nbw, n +! 
logical, value :: is_zero +! integer(kind=ik) :: val_is_zero +! +! if (is_zero) then +! val_is_zero = 1 +! else +! val_is_zero = 0 +! endif +! call launch_extract_hh_tau_c_kernel_complex_& +! &PRECISION& +! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero) +! end subroutine +!#endif /* COMPLEXCASE */ + + end subroutine + +! vim: syntax=fortran diff -Nru elpa-2016.05.001/src/elpa2/elpa2_tridiag_band_template.F90 elpa-2019.11.001/src/elpa2/elpa2_tridiag_band_template.F90 --- elpa-2016.05.001/src/elpa2/elpa2_tridiag_band_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/elpa2_tridiag_band_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1324 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#include "../general/sanity.F90" + + subroutine tridiag_band_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, na, nb, nblk, a_mat, a_dev, lda, d, e, matrixCols, & + hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads) + !------------------------------------------------------------------------------- + ! tridiag_band_real/complex: + ! Reduces a real symmetric band matrix to tridiagonal form + ! + ! na Order of matrix a + ! + ! nb Semi bandwith + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! a_mat(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal + ! + ! lda Leading dimension of a + ! matrixCols local columns of matrix a + ! + ! hh_trans : housholder vectors + ! + ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) + ! + ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! communicator + ! 
MPI-Communicator for the total processor set + !------------------------------------------------------------------------------- + use elpa_abstract_impl + use elpa2_workload + use precision + use iso_c_binding + use redist +#ifdef WITH_OPENMP + use omp_lib +#endif + use elpa_blas_interfaces + use elpa_skewsymmetric_blas + implicit none +#include "../general/precision_kinds.F90" + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: useGPU, wantDebug + integer(kind=c_int) :: skewsymmetric + logical :: isSkewsymmetric + integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck), intent(in) :: a_mat(lda,*) +#else + MATH_DATATYPE(kind=rck), intent(in) :: a_mat(lda,matrixCols) +#endif + integer(kind=c_intptr_t) :: a_dev + real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 + MATH_DATATYPE(kind=rck), intent(out), allocatable :: hh_trans(:,:) + + real(kind=rk) :: vnorm2 + MATH_DATATYPE(kind=rck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf + MATH_DATATYPE(kind=rck) :: hd(nb), hs(nb) + + integer(kind=ik) :: i, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt + integer(kind=ik) :: my_pe, n_pes + integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols + integer(kind=MPI_KIND) :: my_peMPI, n_pesMPI, mpierr + integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI + integer(kind=MPI_KIND) :: ireq_ab, ireq_hv + integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off + integer(kind=ik), intent(in) :: nrThreads +#ifdef WITH_OPENMP + integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter +#ifdef WITH_MPI +#endif + integer(kind=ik), allocatable :: global_id_tmp(:,:) + integer(kind=ik), allocatable :: omp_block_limits(:) + MATH_DATATYPE(kind=rck), allocatable :: hv_t(:,:), tau_t(:) +#endif /* WITH_OPENMP */ + integer(kind=ik), allocatable :: 
global_id(:,:), hh_cnt(:), hh_dst(:) + integer(kind=MPI_KIND), allocatable :: ireq_hhr(:), ireq_hhs(:) + integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) + integer(kind=ik), allocatable :: block_limits(:) + MATH_DATATYPE(kind=rck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) + integer :: istat + character(200) :: errorMessage + character(20) :: gpuString + +#ifndef WITH_MPI + integer(kind=ik) :: startAddr +#endif + + call obj%get("is_skewsymmetric",skewsymmetric,istat) + if (istat .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." + stop + endif + isSkewsymmetric = (skewsymmetric == 1) + + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("tridiag_band_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(communicator,kind=MPI_KIND) ,my_peMPI ,mpierr) + call mpi_comm_size(int(communicator,kind=MPI_KIND) ,n_pesMPI ,mpierr) + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND),my_prowMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND),np_rowsMPI ,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND),my_pcolMPI ,mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND),np_colsMPI ,mpierr) + + my_pe = int(my_peMPI,kind=MPI_KIND) + n_pes = int(n_pesMPI,kind=MPI_KIND) + my_prow = int(my_prowMPI,kind=MPI_KIND) + np_rows = int(np_rowsMPI,kind=MPI_KIND) + my_pcol = int(my_pcolMPI,kind=MPI_KIND) + np_cols = int(np_colsMPI,kind=MPI_KIND) + if (wantDebug) call obj%timer%stop(",kind=MPI_KIND)mpi_communication") + + ! Get global_id mapping 2D procssor coordinates to global id + + allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating global_id "//errorMessage + stop 1 + endif + + global_id(:,:) = 0 + global_id(my_prow, my_pcol) = my_pe + +#ifdef WITH_OPENMP + allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating global_id_tmp "//errorMessage + stop 1 + endif +#endif + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") +#ifndef WITH_OPENMP + call mpi_allreduce(mpi_in_place, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, & + mpi_sum, int(communicator,kind=MPI_KIND), mpierr) +#else + global_id_tmp(:,:) = global_id(:,:) + call mpi_allreduce(global_id_tmp, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, & + mpi_sum, int(communicator,kind=MPI_KIND), mpierr) + deallocate(global_id_tmp, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating global_id_tmp "//errorMessage + stop 1 + endif +#endif /* WITH_OPENMP */ + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + + ! Total number of blocks in the band: + + nblocks_total = (na-1)/nb + 1 + + ! Set work distribution + + allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating block_limits"//errorMessage + stop 1 + endif + + call divide_band(obj,nblocks_total, n_pes, block_limits) + + ! nblocks: the number of blocks for my task + nblocks = block_limits(my_pe+1) - block_limits(my_pe) + + ! allocate the part of the band matrix which is needed by this PE + ! The size is 1 block larger than needed to avoid extensive shifts + allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating ab"//errorMessage + stop 1 + endif + + ab = 0.0_rck ! needed for lower half, the extra block should also be set to 0 for safety + + ! n_off: Offset of ab within band + n_off = block_limits(my_pe)*nb + + ! Redistribute band in a to ab + call redist_band_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(obj,a_mat, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU) + + ! Calculate the workload for each sweep in the back transformation + ! and the space requirements to hold the HH vectors + + allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating limits"//errorMessage + stop 1 + endif + + call determine_workload(obj,na, nb, np_rows, limits) + max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) + + num_hh_vecs = 0 + num_chunks = 0 + nx = na + do n = 1, nblocks_total + call determine_workload(obj, nx, nb, np_rows, limits) + local_size = limits(my_prow+1) - limits(my_prow) + ! add to number of householder vectors + ! please note: for nx==1 the one and only HH Vector is 0 and is neither calculated nor send below! + if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then + num_hh_vecs = num_hh_vecs + local_size + num_chunks = num_chunks+1 + endif + nx = nx - nb + enddo + + ! Allocate space for HH vectors + + allocate(hh_trans(nb,num_hh_vecs), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then +#if REALCASE == 1 + print *,"tridiag_band_real: error when allocating hh_trans"//errorMessage +#endif +#if COMPLEXCASE == 1 + print *,"tridiag_band_complex: error when allocating hh_trans "//errorMessage +#endif + stop 1 + endif + + ! Allocate and init MPI requests + + allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests + if (istat .ne. 
0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating ireq_hhr"//errorMessage + stop 1 + endif + allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYEP& + &: error when allocating ireq_hhs"//errorMessage + stop 1 + endif + + num_hh_vecs = 0 + num_chunks = 0 + nx = na + nt = 0 + do n = 1, nblocks_total + call determine_workload(obj,nx, nb, np_rows, limits) + local_size = limits(my_prow+1) - limits(my_prow) + if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then + num_chunks = num_chunks+1 +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_irecv(hh_trans(1,num_hh_vecs+1), int(nb*local_size,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(nt,kind=MPI_KIND), int(10+n-block_limits(nt),kind=MPI_KIND), & + int(communicator,kind=MPI_KIND), ireq_hhr(num_chunks), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + ! carefull non-block recv data copy must be done at wait or send + ! hh_trans(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) + +#endif /* WITH_MPI */ + num_hh_vecs = num_hh_vecs + local_size + endif + nx = nx - nb + if (n == block_limits(nt+1)) then + nt = nt + 1 + endif + enddo +#ifdef WITH_MPI + ireq_hhs(:) = MPI_REQUEST_NULL +#endif + ! Buffers for gathering/sending the HH vectors + + allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating hh_gath"//errorMessage + stop 1 + endif + + allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating hh_send"//errorMessage + stop 1 + endif + + hh_gath(:,:,:) = 0.0_rck + hh_send(:,:,:) = 0.0_rck + + ! 
Some counters + + allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating hh_cnt"//errorMessage + stop 1 + endif + + allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating hh_dst"//errorMessage + stop 1 + endif + + hh_cnt(:) = 1 ! The first transfomation Vector is always 0 and not calculated at all + hh_dst(:) = 0 ! PE number for receive +#ifdef WITH_MPI + ireq_ab = MPI_REQUEST_NULL + ireq_hv = MPI_REQUEST_NULL +#endif + ! Limits for sending + + allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating snd_limits"//errorMessage + stop 1 + endif + do iblk=1,nblocks + call determine_workload(obj, na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) + enddo + +#ifdef WITH_OPENMP + ! OpenMP work distribution: + max_threads = nrThreads + ! For OpenMP we need at least 2 blocks for every thread + max_threads = MIN(max_threads, nblocks/2) + if (max_threads==0) max_threads = 1 + + allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating omp_block_limits"//errorMessage + stop 1 + endif + + ! Get the OpenMP block limits + call divide_band(obj,nblocks, max_threads, omp_block_limits) + + allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating hv_t, tau_t"//errorMessage + stop 1 + endif + + hv_t = 0.0_rck + tau_t = 0.0_rck +#endif /* WITH_OPENMP */ + + ! --------------------------------------------------------------------------- + ! Start of calculations + + na_s = block_limits(my_pe)*nb + 1 + + if (my_pe>0 .and. 
na_s<=na) then + ! send first column to previous PE + ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) + ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), ireq_ab, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + +#ifndef WITH_MPI + startAddr = ubound(hh_trans,dim=2) +#endif /* WITH_MPI */ + +#ifdef WITH_OPENMP + do istep=1,na-1-block_limits(my_pe)*nb +#else + do istep=1,na-1 +#endif + + if (my_pe==0) then + n = MIN(na-na_s,nb) ! number of rows to be reduced + hv(:) = 0.0_rck + hd(:) = 0.0_rck + tau = 0.0_rck + + ! Transform first column of remaining matrix +#if REALCASE == 1 + ! The last step (istep=na-1) is only needed for sending the last HH vectors. + ! We don't want the sign of the last element flipped (analogous to the other sweeps) +#endif +#if COMPLEXCASE == 1 + ! Opposed to the real case, the last step (istep=na-1) is needed here for making + ! the last subdiagonal element a real number +#endif + +#if REALCASE == 1 + if (istep < na-1) then + ! Transform first column of remaining matrix + vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk8)**2+dimag(ab(3:n+1,na_s-n_off))**2) +#else + vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk4)**2+aimag(ab(3:n+1,na_s-n_off))**2) +#endif + if (n<2) vnorm2 = 0.0_rk ! 
Safety only +#endif /* COMPLEXCASE */ + + call hh_transform_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, ab(2,na_s-n_off), vnorm2, hf, tau, wantDebug) + + hv(1) = 1.0_rck + hv(2:n) = ab(3:n+1,na_s-n_off)*hf +#if REALCASE == 1 + endif +#endif + +#if REALCASE == 1 + if (isSkewsymmetric) then + d(istep) = 0.0_rk + else + d(istep) = ab(1,na_s-n_off) + endif + e(istep) = ab(2,na_s-n_off) +#endif +#if COMPLEXCASE == 1 + d(istep) = real(ab(1,na_s-n_off), kind=rk) + e(istep) = real(ab(2,na_s-n_off), kind=rk) +#endif + + if (istep == na-1) then +#if REALCASE == 1 + if (isSkewsymmetric) then + d(na) = 0 + else + d(na) = ab(1,na_s+1-n_off) + endif +#endif + +#if COMPLEXCASE == 1 + d(na) = real(ab(1,na_s+1-n_off),kind=rk) +#endif + e(na) = 0.0_rck + endif + else + if (na>na_s) then + ! Receive Householder Vector from previous task, from PE owning subdiagonal + +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_recv(hv, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe-1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), & + MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + + hv(1:nb) = hv_s(1:nb) + +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_recv(hv, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe-1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), & + MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#else /* WITH_MPI */ + hv(1:nb) = hv_s(1:nb) +#endif /* WITH_MPI */ + +#endif /* WITH_OPENMP */ + tau = hv(1) + hv(1) = 1.0_rck + endif + endif + + na_s = na_s+1 + if (na_s-n_off > nb) then + ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) + ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0.0_rck + n_off = n_off + nb + endif + +#ifdef WITH_OPENMP + if (max_threads > 1) then + 
+ ! Codepath for OpenMP + + ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! + ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. + ! This simulates the behaviour of the MPI tasks which also work after each other. + ! The code would be considerably easier, if the MPI communication would be made within + ! the parallel region - this is avoided here since this would require + ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. + + hv_t(:,1) = hv + tau_t(1) = tau + + do iter = 1, 2 + + ! iter=1 : work on first block + ! iter=2 : work on remaining blocks + ! This is done in 2 iterations so that we have a barrier in between: + ! After the first iteration, it is guaranteed that the last row of the last block + ! is completed by the next thread. + ! After the first iteration it is also the place to exchange the last row + ! with MPI calls + call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) + +!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & +!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) + do my_thread = 1, max_threads + + if (iter == 1) then + my_block_s = omp_block_limits(my_thread-1) + 1 + my_block_e = my_block_s + else + my_block_s = omp_block_limits(my_thread-1) + 2 + my_block_e = omp_block_limits(my_thread) + endif + + do iblk = my_block_s, my_block_e + + ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block + ne = ns+nb-1 ! last column in block + + if (istepna) exit + + hv = hv_t(:,my_thread) + tau = tau_t(my_thread) + + ! Store Householder Vector for back transformation + + hh_cnt(iblk) = hh_cnt(iblk) + 1 + + hh_gath(1 ,hh_cnt(iblk),iblk) = tau + hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + + nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) + ! 
Note that nr>=0 implies that diagonal block is full (nc==nb)! + + ! Transform diagonal block + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + hd(:) = 0.0_rk + call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd) + else + call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) +#endif + if (wantDebug) call obj%timer%stop("blas") +#if REALCASE == 1 + if (.NOT. isSkewsymmetric) then + x = dot_product(hv(1:nc),hd(1:nc))*tau + endif +#endif +#if COMPLEXCASE == 1 + x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) +#endif + if (.NOT. isSkewsymmetric) then + hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc) + endif + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + call ELPA_PRECISION_SSR2(int(nc,kind=BLAS_KIND), hd, hv, ab(1,ns), & + int(2*nb-1,kind=BLAS_KIND) ) + else + call PRECISION_SYR2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, & + hv, 1_BLAS_KIND, ab(1,ns), int(2*nb-1,kind=BLAS_KIND)) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_HER2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, & + hv, 1_BLAS_KIND, ab(1,ns), int(2*nb-1,kind=BLAS_KIND)) +#endif + if (wantDebug) call obj%timer%stop("blas") + hv_t(:,my_thread) = 0.0_rck + tau_t(my_thread) = 0.0_rck + if (nr<=0) cycle ! No subdiagonal block present any more + + ! Transform subdiagonal block + if (wantDebug) call obj%timer%start("blas") + call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), tau, & + ab(nb+1,ns), int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, & + ZERO, hs, 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + if (nr>1) then + + ! 
complete (old) Householder transformation for first column + + ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 + + ! calculate new Householder transformation for first column + ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) + +#if REALCASE == 1 + vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) +#else + vnorm2 = sum(real(ab(nb+2:nb+nr,ns))**2+aimag(ab(nb+2:nb+nr,ns))**2) +#endif +#endif /* COMPLEXCASE */ + + call hh_transform_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, ab(nb+1,ns), vnorm2, hf, tau_t(my_thread), wantDebug) + + hv_t(1 ,my_thread) = 1.0_rck + hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf + ab(nb+2:,ns) = 0.0_rck + ! update subdiagonal block for old and new Householder transformation + ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster + if (wantDebug) call obj%timer%start("blas") + call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, & + int(nr,kind=BLAS_KIND), int(nb-1,kind=BLAS_KIND), & + tau_t(my_thread), ab(nb,ns+1), int(2*nb-1,kind=BLAS_KIND), & + hv_t(1,my_thread), 1_BLAS_KIND, ZERO, h(2), 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + + x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) + h(2:nb) = h(2:nb) - x*hv(2:nb) + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") + do i=2,nb + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)* & +#if REALCASE == 1 + h(i) - hs(1:nr)*hv(i) +#endif +#if COMPLEXCASE == 1 + conjg(h(i)) - hs(1:nr)*conjg(hv(i)) +#endif + enddo + + else + + ! No new Householder transformation for nr=1, just complete the old one + ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! 
Note: hv(1) == 1 + do i=2,nb +#if REALCASE == 1 + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) +#endif +#if COMPLEXCASE == 1 + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) +#endif + enddo + ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) + hv_t(1,my_thread) = 1.0_rck + endif + + enddo + + enddo ! my_thread +!$omp end parallel do + + call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX) + + if (iter==1) then + ! We are at the end of the first block + + ! Send our first column to previous PE + if (my_pe>0 .and. na_s <= na) then +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif + ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, & + int(communicator,kind=MPI_KIND), ireq_ab, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + endif + + ! Request last column from next PE + ne = na_s + nblocks*nb - (max_threads-1) - 1 +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + if (istep>=max_threads .and. ne <= na) then + call mpi_recv(ab(1,ne-n_off), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), & + MPI_STATUS_IGNORE, mpierr) + endif + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + if (istep>=max_threads .and. ne <= na) then + ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) + endif +#endif /* WITH_MPI */ + else + ! We are at the end of all blocks + + ! Send last HH Vector and TAU to next PE if it has been calculated above + ne = na_s + nblocks*nb - (max_threads-1) - 1 + if (istep>=max_threads .and. 
ne < na) then +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_hv, MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + hv_s(1) = tau_t(max_threads) + hv_s(2:) = hv_t(2:,max_threads) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(hv_s, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe+1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), & + ireq_hv, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + endif + + ! "Send" HH Vector and TAU to next OpenMP thread + do my_thread = max_threads, 2, -1 + hv_t(:,my_thread) = hv_t(:,my_thread-1) + tau_t(my_thread) = tau_t(my_thread-1) + enddo + + endif + enddo ! iter + + else + + ! Codepath for 1 thread without OpenMP + + ! The following code is structured in a way to keep waiting times for + ! other PEs at a minimum, especially if there is only one block. + ! For this reason, it requests the last column as late as possible + ! and sends the Householder Vector and the first column as early + ! as possible. + +#endif /* WITH_OPENMP */ + + do iblk=1,nblocks + ns = na_s + (iblk-1)*nb - n_off ! first column in block + ne = ns+nb-1 ! last column in block + + if (ns+n_off>na) exit + + ! Store Householder Vector for back transformation + + hh_cnt(iblk) = hh_cnt(iblk) + 1 + + hh_gath(1 ,hh_cnt(iblk),iblk) = tau + hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + +#ifndef WITH_OPENMP + if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then + ! Wait for last transfer to finish +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + ! Copy vectors into send buffer + hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) + ! 
Send to destination + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(hh_send(1,1,iblk), int(nb*hh_cnt(iblk),kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & + int(10+iblk,kind=MPI_KIND), int(communicator,kind=MPI_KIND), ireq_hhs(iblk), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + ! do the post-poned irecv here + startAddr = startAddr - hh_cnt(iblk) + hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) +#endif /* WITH_MPI */ + + ! Reset counter and increase destination row + hh_cnt(iblk) = 0 + hh_dst(iblk) = hh_dst(iblk)+1 + endif + + ! The following code is structured in a way to keep waiting times for + ! other PEs at a minimum, especially if there is only one block. + ! For this reason, it requests the last column as late as possible + ! and sends the Householder Vector and the first column as early + ! as possible. +#endif /* WITH_OPENMP */ + nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) + ! Note that nr>=0 implies that diagonal block is full (nc==nb)! + + ! Multiply diagonal block and subdiagonal block with Householder Vector + + if (iblk==nblocks .and. nc==nb) then + + ! We need the last column from the next PE. + ! First do the matrix multiplications without last column ... + + ! Diagonal block, the contribution of the last element is added below! 
+ ab(1,ne) = 0.0_rck + if (wantDebug) call obj%timer%start("blas") + +#if REALCASE == 1 + if (isSkewsymmetric) then + hd(:) = 0.0_rk + call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd) + else + call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) +#endif + ! Subdiagonal block + if (nr>0) call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb-1,kind=BLAS_KIND), & + tau, ab(nb+1,ns), int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, & + ZERO, hs, 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + + ! ... then request last column ... +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") +#ifdef WITH_OPENMP + call mpi_recv(ab(1,ne), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), & + MPI_STATUS_IGNORE, mpierr) +#else /* WITH_OPENMP */ + call mpi_recv(ab(1,ne), int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe+1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), & + MPI_STATUS_IGNORE, mpierr) +#endif /* WITH_OPENMP */ + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + + ab(1:nb+1,ne) = ab_s(1:nb+1) + +#endif /* WITH_MPI */ + + ! ... and complete the result + hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) + hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau + + else + + ! 
Normal matrix multiply + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + hd(:) = 0.0_rk + call ELPA_PRECISION_SSMV(int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), hv, hd) + else + call PRECISION_SYMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_HEMV('L', int(nc,kind=BLAS_KIND), tau, ab(1,ns), int(2*nb-1,kind=BLAS_KIND), & + hv, 1_BLAS_KIND, ZERO, hd, 1_BLAS_KIND) +#endif + if (nr>0) call PRECISION_GEMV('N', int(nr,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), tau, ab(nb+1,ns), & + int(2*nb-1,kind=BLAS_KIND), hv, 1_BLAS_KIND, ZERO, hs, 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + endif + + ! Calculate first column of subdiagonal block and calculate new + ! Householder transformation for this column + hv_new(:) = 0.0_rck ! Needed, last rows must be 0 for nr < nb + tau_new = 0.0_rck + if (nr>0) then + + ! complete (old) Householder transformation for first column + + ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 + + ! calculate new Householder transformation ... + if (nr>1) then +#if REALCASE == 1 + vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk8)**2+dimag(ab(nb+2:nb+nr,ns))**2) +#else + vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk4)**2+aimag(ab(nb+2:nb+nr,ns))**2) +#endif +#endif /* COMPLEXCASE */ + + call hh_transform_& + &MATH_DATATYPE& + &_& + &PRECISION & + (obj, ab(nb+1,ns), vnorm2, hf, tau_new, wantDebug) + hv_new(1) = 1.0_rck + hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf + ab(nb+2:,ns) = 0.0_rck + endif ! nr > 1 + + ! ... 
and send it away immediatly if this is the last block + + if (iblk==nblocks) then +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") +#ifdef WITH_OPENMP + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) +#else + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) +#endif + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + hv_s(1) = tau_new + hv_s(2:) = hv_new(2:) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(hv_s, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe+1,kind=MPI_KIND), 2_MPI_KIND, int(communicator,kind=MPI_KIND), & + ireq_hv, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + endif + + endif + + ! Transform diagonal block +#if REALCASE == 1 + if (.NOT. isSkewsymmetric) then + x = dot_product(hv(1:nc),hd(1:nc))*tau + endif +#endif +#if COMPLEXCASE == 1 + x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) +#endif + +#if REALCASE == 1 + if (.NOT. isSkewsymmetric) then +#endif + hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc) +#if REALCASE == 1 + endif +#endif + if (my_pe>0 .and. iblk==1) then + + ! The first column of the diagonal block has to be send to the previous PE + ! Calculate first column only ... +#if REALCASE == 1 + if (isSkewsymmetric) then + ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) + hv(1:nc)*hd(1) + else + ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) + endif +#endif +#if COMPLEXCASE == 1 + ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) +#endif + ! ... send it away ... 
+#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_ab, MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + ab_s(1:nb+1) = ab(1:nb+1,ns) + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + + call mpi_isend(ab_s, int(nb+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(my_pe-1,kind=MPI_KIND), 1_MPI_KIND, int(communicator,kind=MPI_KIND), & + ireq_ab, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") + +#endif /* WITH_MPI */ + ! ... and calculate remaining columns with rank-2 update + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + if (nc>1) call ELPA_PRECISION_SSR2(int(nc-1,kind=BLAS_KIND), hd(2), hv(2), ab(1,ns+1), int(2*nb-1,kind=BLAS_KIND) ) + else + if (nc>1) call PRECISION_SYR2('L', int(nc-1,kind=BLAS_KIND), -ONE, hd(2), 1_BLAS_KIND, & + hv(2), 1_BLAS_KIND, ab(1,ns+1), int(2*nb-1,kind=BLAS_KIND) ) + endif +#endif +#if COMPLEXCASE == 1 + if (nc>1) call PRECISION_HER2('L', int(nc-1,kind=BLAS_KIND), -ONE, hd(2), 1_BLAS_KIND, & + hv(2), 1_BLAS_KIND, ab(1,ns+1), int(2*nb-1,kind=BLAS_KIND) ) +#endif + if (wantDebug) call obj%timer%stop("blas") + + else + ! No need to send, just a rank-2 update + if (wantDebug) call obj%timer%start("blas") +#if REALCASE == 1 + if (isSkewsymmetric) then + call ELPA_PRECISION_SSR2(int(nc,kind=BLAS_KIND), hd, hv, ab(1,ns), int(2*nb-1,kind=BLAS_KIND)) + else + call PRECISION_SYR2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, & + hv, 1_BLAS_KIND, ab(1,ns), int(2*nb-1,kind=BLAS_KIND) ) + endif +#endif +#if COMPLEXCASE == 1 + call PRECISION_HER2('L', int(nc,kind=BLAS_KIND), -ONE, hd, 1_BLAS_KIND, hv, 1_BLAS_KIND, & + ab(1,ns), int(2*nb-1,kind=BLAS_KIND)) +#endif + if (wantDebug) call obj%timer%stop("blas") + + endif + + ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... 
nb + + if (nr>0) then + if (nr>1) then + if (wantDebug) call obj%timer%start("blas") + call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, int(nr,kind=BLAS_KIND), int(nb-1,kind=BLAS_KIND), & + tau_new, ab(nb,ns+1), int(2*nb-1,kind=BLAS_KIND), & + hv_new, 1_BLAS_KIND, ZERO, h(2), 1_BLAS_KIND) + if (wantDebug) call obj%timer%stop("blas") + + x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new + h(2:nb) = h(2:nb) - x*hv(2:nb) + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update + do i=2,nb +#if REALCASE == 1 + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) +#endif +#if COMPLEXCASE == 1 + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) +#endif + enddo + else + ! No double Householder transformation for nr=1, just complete the row + do i=2,nb +#if REALCASE == 1 + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) +#endif +#if COMPLEXCASE == 1 + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) +#endif + enddo + endif + endif + + ! Use new HH Vector for the next block + hv(:) = hv_new(:) + tau = tau_new + + enddo + +#ifdef WITH_OPENMP + endif +#endif + +#if WITH_OPENMP + do iblk = 1, nblocks + + if (hh_dst(iblk) >= np_rows) exit + if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit + + if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then + ! Wait for last transfer to finish +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + ! Copy vectors into send buffer + hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) + ! 
Send to destination + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_isend(hh_send(1,1,iblk), int(nb*hh_cnt(iblk),kind=MPI_KIND), & + MPI_MATH_DATATYPE_PRECISION_EXPL, & + global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & + int(10+iblk,kind=MPI_KIND), int(communicator,kind=MPI_KIND), ireq_hhs(iblk), mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + ! do the post-poned irecv here + startAddr = startAddr - hh_cnt(iblk) + hh_trans(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) +#endif /* WITH_MPI */ + + ! Reset counter and increase destination row + hh_cnt(iblk) = 0 + hh_dst(iblk) = hh_dst(iblk)+1 + endif + + enddo +#endif /* WITH_OPENMP */ + enddo ! istep + + ! Finish the last outstanding requests + +#ifdef WITH_OPENMP + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) + +! allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage) +! if (istat .ne. 0) then +! print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage +! stop 1 +! endif + + call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) + call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) +! deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) +! if (istat .ne. 0) then +! print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage +! stop 1 +! 
endif + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + +#else /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) + + call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) + call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + +#endif /* WITH_OPENMP */ + +#ifdef WITH_MPI + if (wantDebug) call obj%timer%start("mpi_communication") + call mpi_barrier(int(communicator,kind=MPI_KIND),mpierr) + if (wantDebug) call obj%timer%stop("mpi_communication") +#endif + deallocate(ab, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating ab"//errorMessage + stop 1 + endif + + deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating ireq_hhr, ireq_hhs"//errorMessage + stop 1 + endif + + deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating hh_cnt, hh_dst"//errorMessage + stop 1 + endif + + deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating hh_gath, hh_send"//errorMessage + stop 1 + endif + + deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating limits, send_limits"//errorMessage + stop 1 + endif + + deallocate(block_limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when deallocating block_limits"//errorMessage + stop 1 + endif + + deallocate(global_id, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_band_& + &MATH_DATATYPE& + &: error when allocating global_id"//errorMessage + stop 1 + endif + + call obj%timer%stop("tridiag_band_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) + +! intel compiler bug makes these ifdefs necessary +#if REALCASE == 1 + end subroutine tridiag_band_real_& +#endif +#if COMPLEXCASE == 1 + end subroutine tridiag_band_complex_& +#endif + &PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu --- elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,564 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. 
Marek, MPCDF + + +#include +#include +#include +#include +#include +#include "config-f90.h" + + +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 + +// =========================================================================================================== +// Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on +// devices with compute capability 3.x; for older devices, please use the Fortran kernel version +// =========================================================================================================== + + +#if 0 +static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask) +{ + double real = cuCreal(r) ; + double imag = cuCimag(r); + + + int hr = __shfl_xor(__double2hiint(real), mask); + int lr = __shfl_xor(__double2loint(real), mask); + + int hi = __shfl_xor(__double2hiint(imag), mask); + int li = __shfl_xor(__double2loint(imag), mask); + + + + real = __hiloint2double(hr, lr); + imag = __hiloint2double(hi, li); + return make_cuDoubleComplex(real, imag); + +} +#endif + + +// Perform the equivalent of "__shfl_down" on an 8-byte value +#ifdef DOUBLE_PRECISION_COMPLEX +static __device__ __forceinline__ double shfl_down_complex_double(double r, int offset) +#else +static __device__ __forceinline__ float shfl_down_complex_single(float r, int offset) +#endif +{ + // The following operations do not exist in CUDA 10.1 any more + // It has been commented out. 
The code is still compiled, but not used + // TODO do it properly + + assert(0); + //int hi = __shfl_down(__double2hiint(r), offset); + //int lo = __shfl_down(__double2loint(r), offset); + + //return __hiloint2double(hi, lo); + return 0.; +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void warp_reduce_1_complex_double( cuDoubleComplex *s_block) +#else +__device__ void warp_reduce_1_complex_single( cuFloatComplex *s_block) +#endif +{ + int t_idx ; + t_idx = threadIdx.x; + __syncthreads(); + // attention +#ifdef DOUBLE_PRECISION_COMPLEX + if (t_idx < 32) + { + + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#else + if (t_idx < 32) + { + + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf( s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#endif +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void warp_reduce_2_complex_double( cuDoubleComplex *s_block) +#else +__device__ void warp_reduce_2_complex_single( cuFloatComplex *s_block) +#endif +{ + int t_idx ; + t_idx = threadIdx.x; + __syncthreads(); + // attention +#ifdef DOUBLE_PRECISION_COMPLEX + if(t_idx < 64) + { + s_block[t_idx] = 
cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 64]) , cuCadd( s_block[t_idx + 128], s_block[t_idx + 192]) ); + if (t_idx < 32) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) ); + } + if (t_idx < 8) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#else + if(t_idx < 64) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 64]) , cuCaddf( s_block[t_idx + 128], s_block[t_idx + 192]) ); + if (t_idx < 32) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf( s_block[t_idx + 64], s_block[t_idx + 96]) ); + } + if (t_idx < 8) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } + +#endif +} + +template +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ __forceinline__ cuDoubleComplex warp_reduce_complex_double( cuDoubleComplex r) +#else +__device__ __forceinline__ cuFloatComplex warp_reduce_complex_single( cuFloatComplex r) +#endif +{ + +#ifdef DOUBLE_PRECISION_COMPLEX + double real = cuCreal(r); + double imag = cuCimag(r); +#else + float real = cuCrealf(r); + float imag = cuCimagf(r); +#endif + +#pragma unroll + for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1) + { +#ifdef DOUBLE_PRECISION_COMPLEX + real += shfl_down_complex_double(real, i); +#else + real += shfl_down_complex_single(real, i); +#endif + } 
+#pragma unroll + for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1) + { +#ifdef DOUBLE_PRECISION_COMPLEX + imag += shfl_down_complex_double(imag, i); +#else + imag += shfl_down_complex_single(imag, i); +#endif + } + +#ifdef DOUBLE_PRECISION_COMPLEX + return make_cuDoubleComplex(real,imag); +#else + return make_cuFloatComplex(real,imag); +#endif +} + +#if 0 /* not used anywhere */ +template +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ __forceinline__ void driver_warp_reduce_complex_double(cuDoubleComplex * dotp_s, int w_off) +#else +__device__ __forceinline__ void driver_warp_reduce_complex_single(cuFloatComplex * dotp_s, int w_off) +#endif +{ + int t_idx = threadIdx.x; + + if (HAVE_2_WARPS) + { + // In this case, we have 2 warps, each doing 1 reduction + //attention + if (t_idx < 64) + { +#ifdef DOUBLE_PRECISION_COMPLEX + dotp_s[w_off + t_idx] = warp_reduce_complex_double(cuCadd(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32])); +#else + dotp_s[w_off + t_idx] = warp_reduce_complex_single(cuCaddf(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32])); +#endif + } + } + else + { + // In this case we have 1 warp that performs both reductions + // attention + if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_COMPLEX + dotp_s[t_idx] = warp_reduce_complex_double(cuCadd(dotp_s[t_idx] , dotp_s[t_idx + 32])); + dotp_s[t_idx + 64] = warp_reduce_complex_double(cuCadd(dotp_s[t_idx + 64] , dotp_s[t_idx + 96])); +#else + dotp_s[t_idx] = warp_reduce_complex_single(cuCaddf(dotp_s[t_idx] , dotp_s[t_idx + 32])); + dotp_s[t_idx + 64] = warp_reduce_complex_single(cuCaddf(dotp_s[t_idx + 64] , dotp_s[t_idx + 96])); +#endif + } + } +} +#endif /* not used anywhere */ + + +#ifndef ALREADY_DEFINED_SYNC +// Synchronization wrapper, removing explicit synchronization when the thread-block is at most 32 threads (1 warp) in size +template +__device__ __forceinline__ void sync_real_threads() +{ + if (MUST_SYNC) + { + __syncthreads(); + } +} +#define ALREADY_DEFINED_SYNC 1 +#endif + +#ifdef 
DOUBLE_PRECISION_COMPLEX +__device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __restrict__ s_block) +#else +__device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block) +#endif +{ + // attention + if (blockDim.x >= 64) + { + int t_idx = threadIdx.x; + + if (t_idx < 64) + { + s_block[t_idx].x = s_block[t_idx + 64].x = 0.0; + s_block[t_idx].y = s_block[t_idx + 64].y = 0.0; + + } + } + else + { + int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x; +#ifdef DOUBLE_PRECISION_COMPLEX + int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); +#else + int s_chunk_size = s_chunk * sizeof(cuFloatComplex); +#endif + + // Each thread resets an equally-sized, contiguous portion of the buffer + memset(&(s_block[ threadIdx.x * s_chunk].x), 0, s_chunk_size); + memset( & (s_block[ threadIdx.x * s_chunk].y), 0, s_chunk_size); + + } +} +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void reset_dotp_buffers_2_complex_double( cuDoubleComplex * const __restrict__ s_block) +#else +__device__ void reset_dotp_buffers_2_complex_single( cuFloatComplex * const __restrict__ s_block) +#endif +{ + if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE) + { + int t_idx = threadIdx.x; + + if (t_idx < BLOCK_CYCLIC_BLOCKSIZE) + { + s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0; + s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0; + + } + } + else + { + int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x; +#ifdef DOUBLE_PRECISION_COMPLEX + int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); +#else + int s_chunk_size = s_chunk * sizeof(cuFloatComplex); +#endif + // Each thread resets an equally-sized, contiguous portion of the buffer + memset(&(s_block[ threadIdx.x * s_chunk].x), 0, s_chunk_size); + memset( & (s_block[ threadIdx.x * s_chunk].y), 0, s_chunk_size); + + } +} + + +// ========================= +// Backtransformation kernel +// ========================= +#ifdef DOUBLE_PRECISION_COMPLEX +template__global__ void 
compute_hh_trafo_kernel_2_2_complex_double(cuDoubleComplex * const __restrict__ q, const cuDoubleComplex * const __restrict__ hh, const cuDoubleComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) +#else +template__global__ void compute_hh_trafo_kernel_2_2_complex_single(cuFloatComplex * const __restrict__ q, const cuFloatComplex * const __restrict__ hh, const cuFloatComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) +#endif +{ +#ifdef DOUBLE_PRECISION_COMPLEX + __shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + + cuDoubleComplex q_v2, tau ; +#else + __shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + + cuFloatComplex q_v2, tau ; +#endif + + int t_idx,q_off, h_off, j , b_idx; + + // The thread index selects the position inside the eigenvector selected above + t_idx = threadIdx.x; + b_idx = blockIdx.x ; + + // Compute intial index + j = ncols ; + q_off = b_idx + (j + t_idx) * ldq; + h_off = j * nb + t_idx; + + if(t_idx>0) + { q_s[t_idx] = q[ q_off ]; + } + + while (j>=1) + { + + if ((j == ncols) || (t_idx ==0)) + { + q_s[t_idx] = q[q_off ]; + } + + q_v2 = q_s[t_idx]; + tau = hh_tau[j]; + + __syncthreads(); + + if(t_idx==0) + { + dotp_s[t_idx]= q_v2 ; + } + else + { +#ifdef DOUBLE_PRECISION_COMPLEX + dotp_s[t_idx] = cuCmul(q_v2,cuConj( hh[h_off])); +#else + dotp_s[t_idx] = cuCmulf(q_v2,cuConjf( hh[h_off])); +#endif + } +#ifdef DOUBLE_PRECISION_COMPLEX + warp_reduce_1_complex_double( dotp_s); +#else + warp_reduce_1_complex_single( dotp_s); +#endif + + __syncthreads(); + if(t_idx ==0) + { +#ifdef DOUBLE_PRECISION_COMPLEX + q_v2 = cuCsub(q_v2,cuCmul(dotp_s[0], tau) ); +#else + q_v2 = cuCsubf(q_v2,cuCmulf(dotp_s[0], tau) ); +#endif + } + else + { +#ifdef DOUBLE_PRECISION_COMPLEX + q_v2 = cuCsub(q_v2,cuCmul(cuCmul(dotp_s[0], tau),hh[h_off])); +#else + 
q_v2 = cuCsubf(q_v2,cuCmulf(cuCmulf(dotp_s[0], tau),hh[h_off])); +#endif + } + + if(t_idx < blockDim.x-1) + {q_s[t_idx+1 ] = q_v2; + } + if ((j == 1) || (t_idx == blockDim.x-1)) + {q[q_off] = q_v2; + } + __syncthreads(); + q_off -= ldq; + h_off -= nb; + j -=1; +} +} +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_hh_trafo_c_kernel_complex_double( cuDoubleComplex* q, cuDoubleComplex * hh, cuDoubleComplex * hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols) +#else +extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* q, cuFloatComplex * hh, cuFloatComplex * hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols) +#endif +{ + +#if 0 + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to compute_ hh_ trafo c kernel: %s, %d\n",cudaGetErrorString(err), err); + dim3 n_block, n_thread; + n_block = dim3(nev,1,1); + n_thread = dim3(nb,1,1); +#endif + + switch (nb) + { + // attention + case 256: + case 128: + case 64: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<16><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_2_2_complex_single<16><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 32: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<8><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_2_2_complex_single<8><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 16: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<4><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_2_2_complex_single<4><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 8: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<2><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + 
compute_hh_trafo_kernel_2_2_complex_single<2><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 4: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<1><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_2_2_complex_single<1><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 2: + case 1: +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_trafo_kernel_2_2_complex_double<0><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_2_2_complex_single<0><<>>(q, hh, hh_tau, nb, ldq, off, ncols); +#endif + break; + default: + printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n"); + } + +#if 0 + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n compute hh trafo c kernel failed %s \n",cudaGetErrorString(err) ); + } +#endif + +} + + diff -Nru elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu --- elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,78 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. 
Marek, MPCDF + + +#include +#include +#include +#include +#include "config-f90.h" + +//the complex part +#define DOUBLE_PRECISION_COMPLEX 1 +#include "ev_tridi_band_gpu_c_v2_complex_template.cu" +#undef DOUBLE_PRECISION_COMPLEX + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#undef DOUBLE_PRECISION_COMPLEX +#include "ev_tridi_band_gpu_c_v2_complex_template.cu" +#endif + + +//the real part +#define DOUBLE_PRECISION_REAL 1 +#include "ev_tridi_band_gpu_c_v2_real_template.cu" +#undef DOUBLE_PRECISION_REAL + +#ifdef WANT_SINGLE_PRECISION_REAL +#undef DOUBLE_PRECISION_REAL +#include "ev_tridi_band_gpu_c_v2_real_template.cu" +#endif + diff -Nru elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu --- elpa-2016.05.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,455 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. Marek, MPCDF + + +#include +#include +#include +#include +#include "config-f90.h" + +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 + +// Perform the equivalent of "__shfl_xor" on an 8-byte value +#ifdef DOUBLE_PRECISION_REAL +static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask) +#else +static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask) +#endif +{ + // The following operations do not exist in CUDA 10.1 any more + // It has been commented out. 
The code is still compiled, but not used + // TODO do it properly + assert(0); + +// int hi = __shfl_xor(__double2hiint(r), mask); +// int lo = __shfl_xor(__double2loint(r), mask); +// +// return __hiloint2double(hi, lo); + return 0.; +} + +// Perform the equivalent of "__shfl_down" on an 8-byte value +#ifdef DOUBLE_PRECISION_REAL +static __device__ __forceinline__ double shfl_down_real_double(double r, int offset) +#else +static __device__ __forceinline__ float shfl_down_real_single(float r, int offset) +#endif +{ + // The following operations do not exist in CUDA 10.1 any more + // It has been commented out. The code is still compiled, but not used + // TODO do it properly + assert(0); + +// int hi = __shfl_down(__double2hiint(r), offset); +// int lo = __shfl_down(__double2loint(r), offset); +// +// return __hiloint2double(hi, lo); + return 0.; +} + +// Perform a reduction on a warp or the first part of it +template +#ifdef DOUBLE_PRECISION_REAL +__device__ __forceinline__ double warp_reduce_real_double(double r) +#else +__device__ __forceinline__ float warp_reduce_real_single(float r) +#endif +{ +#pragma unroll + for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1) + { +#ifdef DOUBLE_PRECISION_REAL + r += shfl_down_real_double(r, i); +#else + r += shfl_down_real_single(r, i); +#endif + } + + return r; +} + +// Perform 2 reductions, using either 1 or 2 warps +template +#ifdef DOUBLE_PRECISION_REAL +__device__ __forceinline__ void double_warp_reduce_real_double(double * dotp_s, int w_off) +#else +__device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, int w_off) +#endif +{ + int t_idx = threadIdx.x; + + if (HAVE_2_WARPS) + { + // In this case, we have 2 warps, each doing 1 reduction + // attention + if (t_idx < 64) + { +#ifdef DOUBLE_PRECISION_REAL + dotp_s[w_off + t_idx] = warp_reduce_real_double(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]); +#else + dotp_s[w_off + t_idx] = warp_reduce_real_single(dotp_s[w_off + t_idx] + dotp_s[w_off + 
t_idx + 32]); +#endif + } + } + else + { + // In this case we have 1 warp that performs both reductions + // attention + if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_REAL + dotp_s[t_idx] = warp_reduce_real_double(dotp_s[t_idx] + dotp_s[t_idx + 32]); + dotp_s[t_idx + 64] = warp_reduce_real_double(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); +#else + dotp_s[t_idx] = warp_reduce_real_single(dotp_s[t_idx] + dotp_s[t_idx + 32]); + dotp_s[t_idx + 64] = warp_reduce_real_single(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); +#endif + } + } +} + +// Reset the entire contents of a shared reduction block; the thread block size must be a power-of-2 +#ifdef DOUBLE_PRECISION_REAL +__device__ __forceinline__ void reset_dotp_buffers_real_double(double * const __restrict__ s_block) +#else +__device__ __forceinline__ void reset_dotp_buffers_real_single(float * const __restrict__ s_block) +#endif +{ + // attention + if (blockDim.x >= 64) + { + int t_idx = threadIdx.x; + + if (t_idx < 64) + { + s_block[t_idx] = s_block[t_idx + 64] = 0.0; + } + } + else + { + int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x; +#ifdef DOUBLE_PRECISION_REAL + int s_chunk_size = s_chunk * sizeof(double); +#else + int s_chunk_size = s_chunk * sizeof(float); +#endif + // Each thread resets an equally-sized, contiguous portion of the buffer + memset(s_block + threadIdx.x * s_chunk, 0, s_chunk_size); + } +} + +// ========================= +// Backtransformation kernel +// ========================= + +// We use templates here to avoid additional branching based on the actual size of the thread-block +template +#ifdef DOUBLE_PRECISION_REAL +__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_kernel_real_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot, + const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) +#else +__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) 
compute_hh_trafo_kernel_real_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot, + const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) +#endif + +{ +#ifdef DOUBLE_PRECISION_REAL + __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1]; +#else + __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1]; +#endif + + int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta; +#ifdef DOUBLE_PRECISION_REAL + double q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2; +#else + float q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2; +#endif + // The block index selects the eigenvector (EV) which the current block is responsible for + b_idx = blockIdx.x; + + // The thread index selects the position inside the eigenvector selected above + t_idx = threadIdx.x; + + // The warp offset for the current thread: 0 for the first warp, 32 for the second etc. 
+ w_off = (t_idx >> 5) << 5; + + // The entire contents of the shared reduction buffers must be reset + +#ifdef DOUBLE_PRECISION_REAL + reset_dotp_buffers_real_double(dotp_s); +#else + reset_dotp_buffers_real_single(dotp_s); +#endif + + // Compute initial access indices + j = off + ncols - 1; + q_off = b_idx + (j + t_idx) * ldq; + h_off = j * nb + t_idx; + t_s = t_idx >> 1; + q_delta = ldq << 1; + hh_delta = nb << 1; + + // Load the last EV components in the EV cache + if (t_idx > 0) + { + q_s[t_idx + 1] = q[q_off]; + } + + // Ensure the ring buffer and reduction buffers are initialized + sync_real_threads(); + + while (j >= off + 1) + { + // Per-iteration GMem I/O reads are in order to improve cache hit ratio + + // Read the corresponding compotents in the 2 Householder reflectors + hh_v_1 = __ldg(&hh[h_off]); + hh_v_2 = __ldg(&hh[h_off - nb]); + hh_v_3 = (t_idx == 0)? 0.0 : __ldg(&hh[h_off - 1]); + + // Read the pre-computed dot-product of the 2 Householder reflectors + dot_p = __ldg(&hh_dot[j - 1]); + + // Read the pre-computed values for "Tau" corresponding to the 2 Householder reflectors + tau_1 = __ldg(&hh_tau[j]); + tau_2 = __ldg(&hh_tau[j - 1]); + + // Only read the new EV components (the others are already stored in the shared EV cache, q_s) + if (t_idx == 0) + { + q_s[0] = q[q_off - ldq]; + q_s[1] = q[q_off]; + } + + // Fill the shared buffers for the dot products bewtween the EV subset and the Householder reflectors + q_v_1 = q_s[t_idx + 1]; + q_v_2 = q_s[t_idx]; + + my_r1 = q_v_1 * hh_v_1 * tau_1; + my_r2 = q_v_2 * hh_v_2 * tau_2; + + // After using "shfl_xor", both threads in a pair will hold the same values +#ifdef DOUBLE_PRECISION_REAL + my_r1 += shfl_xor_real_double(my_r1, 1); + my_r2 += shfl_xor_real_double(my_r2, 1); +#else + my_r1 += shfl_xor_real_single(my_r1, 1); + my_r2 += shfl_xor_real_single(my_r2, 1); +#endif + + // Now both threads in a pair can write to the same reduction buffer address without race-condition issues + dotp_s[t_s] = my_r1; 
+ //attention + dotp_s[t_s + 64] = my_r2; + + // Ensure the reduction buffers are fully populated + sync_real_threads(); + + // Perform the 2 reductions using only the first warp (we assume the warp size is 32, valid up to CC 3.x) +#ifdef DOUBLE_PRECISION_REAL + double_warp_reduce_real_double(dotp_s, w_off); +#else + float_warp_reduce_real_single(dotp_s, w_off); +#endif + // Ensure every thread will have access to the reduction results + sync_real_threads(); + + // Each thread collects the reduction results + s_1 = dotp_s[0]; + + // attention + s_2 = dotp_s[64]; + + // Each thread updates its corresponding EV component + q_v_2 = q_v_2 - hh_v_3 * s_1 - hh_v_2 * s_2 + tau_2 * hh_v_2 * s_1 * dot_p; + + if (t_idx == blockDim.x - 1) + { + // The last thread writes the last 2 EV components to the EV matrix + q[q_off] = q_v_1 - hh_v_1 * s_1; + q[q_off - ldq] = q_v_2; + } + else + { + // All other threads update the EV cache for the next iteration + q_s[t_idx + 2] = q_v_2; + } + + sync_real_threads(); + + // Update access indices + q_off -= q_delta; + h_off -= hh_delta; + j -= 2; + } + + // Once the previous loop has finished, we have at most 1 more iteration to perform + + if (j == off - 1) + { + // No iterations remain, so the final contents of the EV matrix are updated + if (t_idx < blockDim.x - 1) + { + q[q_off + ldq] = q_v_2; + } + } + else + { + // One iteration remains; it must be processed separately + if (t_idx == 0) + { + // Only one more EV element needs to be loaded + q_s[1] = q[q_off]; + } + + // As before, we first read the EV and Householder components + q_v_1 = q_s[t_idx + 1]; + hh_v_1 = __ldg(&hh[h_off]); + tau_1 = __ldg(&hh_tau[j]); + + // We prepare the reduction buffer + my_r1 = q_v_1 * hh_v_1 * tau_1; +#ifdef DOUBLE_PRECISION_REAL + my_r1 += shfl_xor_real_double(my_r1, 1); +#else + my_r1 += shfl_xor_real_single(my_r1, 1); +#endif + dotp_s[t_s] = my_r1; + + sync_real_threads(); + + // We perform the reduction using the first warp only + // attention + 
if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_REAL + dotp_s[t_idx] = warp_reduce_real_double(dotp_s[t_idx] + dotp_s[t_idx + 32]); +#else + dotp_s[t_idx] = warp_reduce_real_single(dotp_s[t_idx] + dotp_s[t_idx + 32]); +#endif + } + + sync_real_threads(); + + // The last EV components are written to the EV matrix + q[q_off] = q_v_1 - hh_v_1 * dotp_s[0]; + } +} + +// This is a host wrapper for calling the appropriate back-transformation kernel, based on the SCALAPACK block size +#ifdef DOUBLE_PRECISION_REAL + extern "C" void launch_compute_hh_trafo_c_kernel_real_double(double * const q, const double * const hh, const double * const hh_dot, const double * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols) +#else + extern "C" void launch_compute_hh_trafo_c_kernel_real_single(float * const q, const float * const hh, const float * const hh_dot, const float * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols) +#endif +{ + switch (nb) + { + // attention + case 128: + case 64: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<16, true><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_real_single<16, true><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 32: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<8, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_real_single<8, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 16: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<4, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_real_single<4, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 8: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<2, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + 
compute_hh_trafo_kernel_real_single<2, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 4: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<1, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_real_single<1, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + case 2: + case 1: +#ifdef DOUBLE_PRECISION_REAL + compute_hh_trafo_kernel_real_double<0, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#else + compute_hh_trafo_kernel_real_single<0, false><<>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols); +#endif + break; + + default: + printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE .\n"); + } +} + diff -Nru elpa-2016.05.001/src/elpa2/GPU/interface_c_kernel.F90 elpa-2019.11.001/src/elpa2/GPU/interface_c_kernel.F90 --- elpa-2016.05.001/src/elpa2/GPU/interface_c_kernel.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/GPU/interface_c_kernel.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1186 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! 
Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. + +!This is a module contains all CUDA C Calls +! it was provided by NVIDIA with their ELPA GPU port and +! adapted for an ELPA release by A.Marek, RZG + +#include "config-f90.h" + +module cuda_c_kernel + implicit none + +#if 0 /* not used anywhere */ + interface + subroutine launch_dot_product_kernel_c_complex_double(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) & + bind(c,name="launch_dot_product_kernel_complex_double") + use precision + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nr + integer(kind=C_intptr_T), value :: hs_dev ,hv_new_dev,x_dev,h_dev, hv_dev + complex(kind=ck8),value :: tau_new + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_dot_product_kernel_c_complex_single(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) & + bind(c,name="launch_dot_product_kernel_complex_single") + use precision + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nr + integer(kind=C_intptr_T), value :: hs_dev ,hv_new_dev,x_dev,h_dev, hv_dev + complex(kind=ck4),value :: tau_new + + end subroutine + end interface + +#endif + +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + + 
interface + subroutine launch_dot_product_kernel_1_c_complex_double(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) & + bind(c, name="launch_dot_product_kernel_1_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, nr, ns + integer(kind=C_intptr_T), value :: x_dev,h_dev, hv_dev, ab_dev, hs_dev,hv_new_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_dot_product_kernel_1_c_complex_single(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) & + bind(c, name="launch_dot_product_kernel_1_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, nr, ns + integer(kind=C_intptr_T), value :: x_dev,h_dev, hv_dev, ab_dev, hs_dev,hv_new_dev + + end subroutine + end interface + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + + interface + subroutine launch_dot_product_kernel_2_c_complex_double(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) & + bind(c,name="launch_dot_product_kernel_2_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, nr, ne + integer(kind=C_intptr_T), value :: hd_dev,hv_dev, hs_dev, ab_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_dot_product_kernel_2_c_complex_single(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) & + bind(c,name="launch_dot_product_kernel_2_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, nr, ne + integer(kind=C_intptr_T), value :: hd_dev,hv_dev, hs_dev, ab_dev + + end subroutine + end interface + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + interface + subroutine launch_double_hh_transform_1_c_complex_double(ab_dev, hs_dev,hv_dev,nb,ns) & + bind(c,name="launch_double_hh_transform_1_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, ns + 
integer(kind=C_intptr_T), value :: hv_dev, ab_dev,hs_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_double_hh_transform_1_c_complex_single(ab_dev, hs_dev,hv_dev,nb,ns) & + bind(c,name="launch_double_hh_transform_1_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nb, ns + integer(kind=C_intptr_T), value :: hv_dev, ab_dev,hs_dev + + end subroutine + end interface + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + interface + subroutine launch_double_hh_transform_2_c_complex_double(ab_dev, hd_dev,hv_dev,nc,ns, nb) & + bind(c,name="launch_double_hh_transform_2_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nc, ns, nb + integer(kind=C_intptr_T), value :: hv_dev, ab_dev,hd_dev + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_double_hh_transform_2_c_complex_single(ab_dev, hd_dev,hv_dev,nc,ns, nb) & + bind(c,name="launch_double_hh_transform_2_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nc, ns, nb + integer(kind=C_intptr_T), value :: hv_dev, ab_dev,hd_dev + end subroutine + end interface + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + interface + subroutine launch_compute_kernel_reduce_c_complex_double(a_dev, lda, n, nbw, h1_dev) & + bind(c,name="launch_compute_kernel_reduce_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: n,lda,nbw + integer(kind=C_intptr_T), value :: h1_dev ,a_dev + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_compute_kernel_reduce_c_complex_single(a_dev, lda, n, nbw, h1_dev) & + bind(c,name="launch_compute_kernel_reduce_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: n,lda,nbw + integer(kind=C_intptr_T), value :: 
h1_dev ,a_dev + end subroutine + end interface +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + interface + subroutine launch_compute_kernel_reduce_1_c_complex_double(a_dev, lda, n, h1_dev) & + bind(c,name="launch_compute_kernel_reduce_1_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: n,lda + integer(kind=C_intptr_T), value :: h1_dev ,a_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_compute_kernel_reduce_1_c_complex_single(a_dev, lda, n, h1_dev) & + bind(c,name="launch_compute_kernel_reduce_1_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: n,lda + integer(kind=C_intptr_T), value :: h1_dev ,a_dev + + end subroutine + end interface +#endif +#endif /* not used anywhere */ + + interface + subroutine launch_compute_hh_trafo_c_kernel_real_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_real_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_dot + integer(c_intptr_t), value :: hh_tau ,hh + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + interface + subroutine launch_compute_hh_trafo_c_kernel_real_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_real_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_dot + integer(c_intptr_t), value :: hh_tau ,hh + end subroutine + end interface + +#endif + + interface + subroutine launch_compute_hh_trafo_c_kernel_complex_double(q, hh, hh_tau, nev, nb,ldq,off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_complex_double") + + use iso_c_binding + + implicit none + 
integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_tau ,hh + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_compute_hh_trafo_c_kernel_complex_single(q, hh, hh_tau, nev, nb,ldq,off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_tau ,hh + end subroutine + end interface + +#endif + +#if 0 + interface + subroutine launch_compute_hh_trafo_c_kernel_complex_1_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_complex_1_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_tau ,hh, hh_dot + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_compute_hh_trafo_c_kernel_complex_1_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) & + bind(c,name="launch_compute_hh_trafo_c_kernel_complex_1_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t), value :: q + integer(kind=c_intptr_t), value :: hh_tau ,hh, hh_dot + + end subroutine + end interface + +#endif + +#endif + + interface + subroutine launch_my_unpack_c_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) bind(c,name="launch_my_unpack_c_kernel_real_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count + integer(kind=c_int), value :: n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev, row_group_dev + + end subroutine + 
end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + interface + subroutine launch_my_unpack_c_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) bind(c,name="launch_my_unpack_c_kernel_real_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count + integer(kind=c_int), value :: n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev, row_group_dev + + end subroutine + end interface + +#endif + + interface + subroutine launch_my_pack_c_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, & + stripe_count, l_nev, a_dev, & + row_group_dev) bind(c,name="launch_my_pack_c_kernel_real_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev + integer(kind=c_intptr_t), value :: row_group_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + interface + subroutine launch_my_pack_c_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev, a_dev, & + row_group_dev) bind(c,name="launch_my_pack_c_kernel_real_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev + integer(kind=c_intptr_t), value :: row_group_dev + + end subroutine + end interface + +#endif + + interface + subroutine launch_compute_hh_dotp_c_kernel_real_double(bcast_buffer_dev, hh_dot_dev, nbw, n) & + bind(c,name="launch_compute_hh_dotp_c_kernel_real_double") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: bcast_buffer_dev + integer(kind=c_intptr_t), value :: hh_dot_dev + integer(kind=c_int), value :: nbw, n + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + 
interface + subroutine launch_compute_hh_dotp_c_kernel_real_single(bcast_buffer_dev, hh_dot_dev, nbw, n) & + bind(c,name="launch_compute_hh_dotp_c_kernel_real_single") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: bcast_buffer_dev + integer(kind=c_intptr_t), value :: hh_dot_dev + integer(kind=c_int), value :: nbw, n + + end subroutine + end interface + +#endif + + interface + subroutine launch_extract_hh_tau_c_kernel_real_double(hh, hh_tau, nb, n, is_zero) & + bind(c,NAME="launch_extract_hh_tau_c_kernel_real_double") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: hh + integer(kind=c_intptr_t), value :: hh_tau + integer(kind=c_int), value :: nb, n + integer(kind=c_int), value :: is_zero + + end subroutine + end interface +#ifdef WANT_SINGLE_PRECISION_REAL + interface + subroutine launch_extract_hh_tau_c_kernel_real_single(hh, hh_tau, nb, n, is_zero) & + bind(c,NAME="launch_extract_hh_tau_c_kernel_real_single") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: hh + integer(kind=c_intptr_t), value :: hh_tau + integer(kind=c_int), value :: nb, n + integer(kind=c_int), value :: is_zero + + end subroutine + end interface +#endif + + interface + subroutine launch_my_unpack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width, a_dim2, & + stripe_count, l_nev, & + row_group_dev, a_dev) bind(c,name="launch_my_unpack_c_kernel_complex_double") + + use iso_c_binding + + implicit none + + integer(kind=c_int), value :: row_count + integer(kind=c_int), value :: n_offset, max_idx,stripe_width, a_dim2, stripe_count,l_nev + integer(kind=c_intptr_t), value :: a_dev, row_group_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_my_unpack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, & + row_group_dev, a_dev) bind(c,name="launch_my_unpack_c_kernel_complex_single") + + use 
iso_c_binding + + implicit none + + integer(kind=c_int), value :: row_count + integer(kind=c_int), value :: n_offset, max_idx,stripe_width, a_dim2, stripe_count,l_nev + integer(kind=c_intptr_t), value :: a_dev, row_group_dev + + end subroutine + end interface + +#endif + + interface + subroutine launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx,stripe_width,a_dim2, & + stripe_count, l_nev, a_dev, & + row_group_dev) bind(c,name="launch_my_pack_c_kernel_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count, n_offset, max_idx, stripe_width, a_dim2,stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev + integer(kind=c_intptr_t), value :: row_group_dev + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx,stripe_width,a_dim2, & + stripe_count, l_nev, a_dev, & + row_group_dev) bind(c,name="launch_my_pack_c_kernel_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_int), value :: row_count, n_offset, max_idx, stripe_width, a_dim2,stripe_count, l_nev + integer(kind=c_intptr_t), value :: a_dev + integer(kind=c_intptr_t), value :: row_group_dev + + end subroutine + end interface + +#endif + + interface + subroutine launch_compute_hh_dotp_c_kernel_complex_double(bcast_buffer_dev, hh_dot_dev, nbw,n) & + bind(c,name="launch_compute_hh_dotp_c_kernel_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: bcast_buffer_dev + integer(kind=c_intptr_t), value :: hh_dot_dev + integer(kind=c_int), value :: nbw, n + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + interface + subroutine launch_compute_hh_dotp_c_kernel_complex_single(bcast_buffer_dev, hh_dot_dev, nbw,n) & + bind(c,name="launch_compute_hh_dotp_c_kernel_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: 
bcast_buffer_dev + integer(kind=c_intptr_t), value :: hh_dot_dev + integer(kind=c_int), value :: nbw, n + end subroutine + end interface + +#endif + + interface + subroutine launch_extract_hh_tau_c_kernel_complex_double(hh, hh_tau, nb, n, is_zero) & + bind(c,name="launch_extract_hh_tau_c_kernel_complex_double") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: hh + integer(kind=c_intptr_t), value :: hh_tau + integer(kind=c_int), value :: nb, n + integer(kind=c_int), value :: is_zero + + end subroutine + end interface + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + interface + subroutine launch_extract_hh_tau_c_kernel_complex_single(hh, hh_tau, nb, n, is_zero) & + bind(c,name="launch_extract_hh_tau_c_kernel_complex_single") + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t), value :: hh + integer(kind=c_intptr_t), value :: hh_tau + integer(kind=c_int), value :: nb, n + integer(kind=c_int), value :: is_zero + + end subroutine + end interface + +#endif + + contains + +#if 0 /* not used anywhere */ + subroutine launch_dot_product_kernel_complex_double(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) + + use iso_c_binding + use precision + implicit none + integer(kind=c_int) :: nr + integer(kind=C_intptr_T) :: hs_dev ,hv_new_dev,x_dev,h_dev, hv_dev + complex(kind=ck8) :: tau_new +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_c_complex_double(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_dot_product_kernel_complex_single(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) + + use iso_c_binding + use precision + implicit none + integer(kind=c_int) :: nr + integer(kind=C_intptr_T) :: hs_dev ,hv_new_dev,x_dev,h_dev, hv_dev + complex(kind=ck4) :: tau_new +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_c_complex_single(hs_dev, hv_new_dev, tau_new, x_dev, h_dev,hv_dev, nr) +#endif + end subroutine +#endif + +#endif /* 
not used anywhere */ + +#if 0 /* not used anywhere */ + + subroutine launch_dot_product_kernel_1_complex_double(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nb, nr, ns + integer(kind=C_intptr_T) :: x_dev,h_dev, hv_dev, ab_dev, hs_dev,hv_new_dev +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_1_c_complex_double(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_dot_product_kernel_1_complex_single(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nb, nr, ns + integer(kind=C_intptr_T) :: x_dev,h_dev, hv_dev, ab_dev, hs_dev,hv_new_dev +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_1_c_complex_single(ab_dev, hs_dev, hv_new_dev, x_dev,h_dev,hv_dev,nb, nr, ns) +#endif + end subroutine + +#endif + +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + subroutine launch_dot_product_kernel_2_complex_double(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nb, nr, ne + integer(kind=C_intptr_T) :: hd_dev,hv_dev, hs_dev, ab_dev +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_2_c_complex_double(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + subroutine launch_dot_product_kernel_2_complex_single(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nb, nr, ne + integer(kind=C_intptr_T) :: hd_dev,hv_dev, hs_dev, ab_dev +#ifdef WITH_GPU_VERSION + call launch_dot_product_kernel_2_c_complex_single(ab_dev, hs_dev, hv_dev,hd_dev,nb, nr, ne) +#endif + end subroutine +#endif + +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + subroutine launch_double_hh_transform_1_complex_double(ab_dev, hs_dev,hv_dev,nb,ns) + + use 
iso_c_binding + + implicit none + integer(kind=c_int) :: nb, ns + integer(kind=C_intptr_T) :: hv_dev, ab_dev,hs_dev +#ifdef WITH_GPU_VERSION + call launch_double_hh_transform_1_c_complex_double(ab_dev, hs_dev,hv_dev,nb,ns) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_double_hh_transform_1_complex_single(ab_dev, hs_dev,hv_dev,nb,ns) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nb, ns + integer(kind=C_intptr_T) :: hv_dev, ab_dev,hs_dev +#ifdef WITH_GPU_VERSION + call launch_double_hh_transform_1_c_complex_single(ab_dev, hs_dev,hv_dev,nb,ns) +#endif + end subroutine + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + subroutine launch_double_hh_transform_2_complex_double(ab_dev, hd_dev,hv_dev,nc,ns, nb) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nc, ns, nb + integer(kind=C_intptr_T) :: hv_dev, ab_dev,hd_dev +#ifdef WITH_GPU_VERSION + call launch_double_hh_transform_2_c_complex_double(ab_dev, hd_dev,hv_dev,nc,ns, nb) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + subroutine launch_double_hh_transform_2_complex_single(ab_dev, hd_dev,hv_dev,nc,ns, nb) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nc, ns, nb + integer(kind=C_intptr_T) :: hv_dev, ab_dev,hd_dev +#ifdef WITH_GPU_VERSION + call launch_double_hh_transform_2_c_complex_single(ab_dev, hd_dev,hv_dev,nc,ns, nb) +#endif + end subroutine + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + subroutine launch_compute_kernel_reduce_complex_double(a_dev, lda, n, nbw, h1_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: n,lda,nbw + integer(kind=C_intptr_T) :: h1_dev ,a_dev +#ifdef WITH_GPU_VERSION + call launch_compute_kernel_reduce_c_complex_double(a_dev, lda, n, nbw, h1_dev) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_compute_kernel_reduce_complex_single(a_dev, lda, n, nbw, h1_dev) + + use 
iso_c_binding + + implicit none + integer(kind=c_int) :: n,lda,nbw + integer(kind=C_intptr_T) :: h1_dev ,a_dev +#ifdef WITH_GPU_VERSION + call launch_compute_kernel_reduce_c_complex_single(a_dev, lda, n, nbw, h1_dev) +#endif + end subroutine + +#endif +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + + subroutine launch_compute_kernel_reduce_1_complex_double(a_dev, lda, n, h1_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: n,lda + integer(kind=C_intptr_T) :: h1_dev ,a_dev +#ifdef WITH_GPU_VERSION + call launch_compute_kernel_reduce_1_c_complex_double(a_dev, lda, n, h1_dev) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_compute_kernel_reduce_1_complex_single(a_dev, lda, n, h1_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: n,lda + integer(kind=C_intptr_T) :: h1_dev ,a_dev +#ifdef WITH_GPU_VERSION + call launch_compute_kernel_reduce_1_c_complex_single(a_dev, lda, n, h1_dev) +#endif + end subroutine + +#endif +#endif /* not used anywhere */ + + subroutine launch_compute_hh_trafo_gpu_kernel_real_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_dot + integer(c_intptr_t) :: hh_tau ,hh +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_real_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL + subroutine launch_compute_hh_trafo_gpu_kernel_real_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_dot + integer(c_intptr_t) :: hh_tau ,hh +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_real_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) +#endif + end subroutine 
+ +#endif + + subroutine launch_compute_hh_trafo_gpu_kernel_complex_double(q, hh, hh_tau, nev, nb,ldq,off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_tau ,hh +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_complex_double(q, hh, hh_tau, nev, nb,ldq,off, ncols) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_compute_hh_trafo_gpu_kernel_complex_single(q, hh, hh_tau, nev, nb,ldq,off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_tau ,hh +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_complex_single(q, hh, hh_tau, nev, nb,ldq,off, ncols) +#endif + end subroutine + +#endif + +#if 0 + subroutine launch_compute_hh_trafo_gpu_kernel_1_complex_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_tau ,hh, hh_dot +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_complex_1_double(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_compute_hh_trafo_gpu_kernel_1_complex_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: nev, nb, ldq, off, ncols + integer(kind=c_intptr_t) :: q + integer(kind=c_intptr_t) :: hh_tau ,hh, hh_dot +#ifdef WITH_GPU_VERSION + call launch_compute_hh_trafo_c_kernel_complex_1_single(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) +#endif + end subroutine +#endif + +#endif + + subroutine launch_my_unpack_gpu_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) + + use 
iso_c_binding + + implicit none + integer(kind=c_int) :: row_count + integer(kind=c_int) :: n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev, row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_unpack_c_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) +#endif + + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL + subroutine launch_my_unpack_gpu_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: row_count + integer(kind=c_int) :: n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev, row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_unpack_c_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, & + l_nev,row_group_dev, a_dev) +#endif + + end subroutine + +#endif + + subroutine launch_my_pack_gpu_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, & + stripe_count, l_nev, a_dev, row_group_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t) :: row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_pack_c_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, a_dev, & + row_group_dev) +#endif + + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL + subroutine launch_my_pack_gpu_kernel_real_single(row_count, n_offset, max_idx,stripe_width, & + a_dim2, stripe_count, l_nev, a_dev, & + row_group_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t) :: row_group_dev +#ifdef WITH_GPU_VERSION + call 
launch_my_pack_c_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, a_dev, & + row_group_dev) +#endif + + end subroutine + +#endif + + subroutine launch_compute_hh_dotp_gpu_kernel_real_double(bcast_buffer_dev, hh_dot_dev, nbw, n) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev + integer(kind=c_intptr_t) :: hh_dot_dev + integer(kind=c_int) :: nbw, n +#ifdef WITH_GPU_VERSION + call launch_compute_hh_dotp_c_kernel_real_double(bcast_buffer_dev, hh_dot_dev, nbw, n) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL + + subroutine launch_compute_hh_dotp_gpu_kernel_real_single(bcast_buffer_dev, hh_dot_dev, nbw, n) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev + integer(kind=c_intptr_t) :: hh_dot_dev + integer(kind=c_int) :: nbw, n +#ifdef WITH_GPU_VERSION + call launch_compute_hh_dotp_c_kernel_real_single(bcast_buffer_dev, hh_dot_dev, nbw, n) +#endif + end subroutine + +#endif + + subroutine launch_extract_hh_tau_gpu_kernel_real_double(hh, hh_tau, nb, n, is_zero) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: hh + integer(kind=c_intptr_t) :: hh_tau + integer(kind=c_int) :: nb, n + integer(kind=c_int) :: is_zero +#ifdef WITH_GPU_VERSION + call launch_extract_hh_tau_c_kernel_real_double(hh, hh_tau, nb, n, is_zero) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL + subroutine launch_extract_hh_tau_gpu_kernel_real_single(hh, hh_tau, nb, n, is_zero) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: hh + integer(kind=c_intptr_t) :: hh_tau + integer(kind=c_int) :: nb, n + integer(kind=c_int) :: is_zero +#ifdef WITH_GPU_VERSION + call launch_extract_hh_tau_c_kernel_real_single(hh, hh_tau, nb, n, is_zero) +#endif + end subroutine + +#endif + + subroutine launch_my_unpack_gpu_kernel_complex_double(row_count, n_offset, max_idx, stripe_width, & + a_dim2, stripe_count, l_nev, row_group_dev, a_dev) + + 
use iso_c_binding + + implicit none + + integer(kind=c_int) :: row_count + integer(kind=c_int) :: n_offset, max_idx,stripe_width, a_dim2, stripe_count,l_nev + integer(kind=c_intptr_t) :: a_dev, row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_unpack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, & + row_group_dev, a_dev) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_my_unpack_gpu_kernel_complex_single(row_count, n_offset, max_idx, stripe_width, & + a_dim2, stripe_count, l_nev, row_group_dev, a_dev) + + use iso_c_binding + + implicit none + + integer(kind=c_int) :: row_count + integer(kind=c_int) :: n_offset, max_idx,stripe_width, a_dim2, stripe_count,l_nev + integer(kind=c_intptr_t) :: a_dev, row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_unpack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, & + row_group_dev, a_dev) +#endif + end subroutine + +#endif + + subroutine launch_my_pack_gpu_kernel_complex_double(row_count, n_offset, max_idx,stripe_width,a_dim2, & + stripe_count, l_nev, a_dev, & + row_group_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: row_count, n_offset, max_idx, stripe_width, a_dim2,stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t) :: row_group_dev +#ifdef WITH_GPU_VERSION + call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, a_dev, & + row_group_dev) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + subroutine launch_my_pack_gpu_kernel_complex_single(row_count, n_offset, max_idx,stripe_width,a_dim2, & + stripe_count, l_nev, a_dev, & + row_group_dev) + + use iso_c_binding + + implicit none + integer(kind=c_int) :: row_count, n_offset, max_idx, stripe_width, a_dim2,stripe_count, l_nev + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t) :: row_group_dev 
+#ifdef WITH_GPU_VERSION + call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, a_dev, & + row_group_dev) +#endif + end subroutine + +#endif + + subroutine launch_compute_hh_dotp_gpu_kernel_complex_double(bcast_buffer_dev, hh_dot_dev, nbw,n) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev + integer(kind=c_intptr_t) :: hh_dot_dev + integer(kind=c_int) :: nbw, n +#ifdef WITH_GPU_VERSION + call launch_compute_hh_dotp_c_kernel_complex_double(bcast_buffer_dev, hh_dot_dev, nbw,n) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_compute_hh_dotp_gpu_kernel_complex_single(bcast_buffer_dev, hh_dot_dev, nbw,n) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev + integer(kind=c_intptr_t) :: hh_dot_dev + integer(kind=c_int) :: nbw, n +#ifdef WITH_GPU_VERSION + call launch_compute_hh_dotp_c_kernel_complex_single(bcast_buffer_dev, hh_dot_dev, nbw,n) +#endif + end subroutine +#endif + + subroutine launch_extract_hh_tau_gpu_kernel_complex_double(hh, hh_tau, nb, n, is_zero) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: hh + integer(kind=c_intptr_t) :: hh_tau + integer(kind=c_int) :: nb, n + integer(kind=c_int) :: is_zero +#ifdef WITH_GPU_VERSION + call launch_extract_hh_tau_c_kernel_complex_double(hh, hh_tau, nb, n, is_zero) +#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + subroutine launch_extract_hh_tau_gpu_kernel_complex_single(hh, hh_tau, nb, n, is_zero) + + use iso_c_binding + + implicit none + integer(kind=c_intptr_t) :: hh + integer(kind=c_intptr_t) :: hh_tau + integer(kind=c_int) :: nb, n + integer(kind=c_int) :: is_zero +#ifdef WITH_GPU_VERSION + call launch_extract_hh_tau_c_kernel_complex_single(hh, hh_tau, nb, n, is_zero) +#endif + end subroutine +#endif +end module cuda_c_kernel + diff -Nru elpa-2016.05.001/src/elpa2/kernels/asm_x86_64_double_precision.s 
elpa-2019.11.001/src/elpa2/kernels/asm_x86_64_double_precision.s --- elpa-2016.05.001/src/elpa2/kernels/asm_x86_64_double_precision.s 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/asm_x86_64_double_precision.s 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,768 @@ +# This file is part of ELPA. +# +# The ELPA library was originally created by the ELPA consortium, +# consisting of the following organizations: +# +# - Max Planck Computing and Data Facility (MPCDF), formerly known as +# Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +# - Bergische Universität Wuppertal, Lehrstuhl für angewandte +# Informatik, +# - Technische Universität München, Lehrstuhl für Informatik mit +# Schwerpunkt Wissenschaftliches Rechnen , +# - Fritz-Haber-Institut, Berlin, Abt. Theorie, +# - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +# Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +# and +# - IBM Deutschland GmbH +# +# +# More information can be found here: +# http://elpa.mpcdf.mpg.de/ +# +# ELPA is free software: you can redistribute it and/or modify +# it under the terms of the version 3 of the license of the +# GNU Lesser General Public License as published by the Free +# Software Foundation. +# +# ELPA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with ELPA. 
If not, see +# +# ELPA reflects a substantial effort on the part of the original +# ELPA consortium, and we ask you to respect the spirit of the +# license that we chose: i.e., please contribute any changes you +# may have back to the original ELPA library distribution, and keep +# any derivatives of ELPA under the same license that we chose for +# the original distribution, the GNU Lesser General Public License. +# + + +# -------------------------------------------------------------------------------------------------- +# +# This file contains the compute intensive kernels for the Householder transformations, +# coded in x86_64 assembler and using SSE2/SSE3 instructions. +# +# It must be assembled with GNU assembler (just "as" on most Linux machines) +# +# Copyright of the original code rests with the authors inside the ELPA +# consortium. The copyright of any additional modifications shall rest +# with their original authors, but shall adhere to the licensing terms +# distributed along with the original code in the file "COPYING". 
+# +# -------------------------------------------------------------------------------------------------- + .globl double_hh_trafo_real_double_sse_assembly + .globl single_hh_trafo_complex_double_sse_assembly + .text + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + + .macro hh_trafo_real nrows + + # When this macro is called, the following registers are set and must not be changed + # %rdi: Address of q + # %rsi: Address of hh + # %rdx: nb + # %rcx: Remaining rows nq + # %r8: ldq in bytes + # %r9: ldh in bytes + # %rax: address of hh at the end of the loops + # The top of the stack must contain the dot product of the two Householder vectors + + movq %rdi, %r10 # Copy address of q + movq %rsi, %r11 # Copy address of hh + + +# x1 = q(1,2) +# x2 = q(2,2) +# +# y1 = q(1,1) + q(1,2)*hh(2,2) +# y2 = q(2,1) + q(2,2)*hh(2,2) + + movaps (%r10), %xmm6 # y1 = q(1,1) + movaps 16(%r10), %xmm7 # y2 = q(2,1) + .if \nrows>=8 + movaps 32(%r10), %xmm8 + movaps 48(%r10), %xmm9 + .if \nrows==12 + movaps 64(%r10), %xmm10 + movaps 80(%r10), %xmm11 + .endif + .endif + + addq %r8, %r10 # %r10 => q(.,2) + movddup 8(%r11,%r9), %xmm15 # hh(2,2) + + .macro mac_pre_loop1 qoff, X, Y + movaps \qoff(%r10), \X # xn = q(n,2) + movaps \X, %xmm12 + mulpd %xmm15, %xmm12 + addpd %xmm12, \Y # yn = yn + xn*h(2,2) + .endm + + mac_pre_loop1 0, %xmm0, %xmm6 + mac_pre_loop1 16, %xmm1, %xmm7 + .if \nrows>=8 + mac_pre_loop1 32, %xmm2, %xmm8 + mac_pre_loop1 48, %xmm3, %xmm9 + .if \nrows==12 + mac_pre_loop1 64, %xmm4, %xmm10 + mac_pre_loop1 80, %xmm5, %xmm11 + .endif + .endif + .purgem mac_pre_loop1 + +# do i=3,nb +# h1 = hh(i-1,1) +# h2 = hh(i,2) +# x1 = x1 + q(1,i)*h1 +# y1 = y1 + q(1,i)*h2 +# x2 = x2 + q(2,i)*h1 +# y2 = y2 + q(2,i)*h2 +# ... 
+# enddo + + addq $8, %r11 + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + movddup (%r11), %xmm14 # hh(i-1,1) + movddup 8(%r11,%r9), %xmm15 # hh(i,2) + + .macro mac_loop1 qoff, X, Y + movaps \qoff(%r10), %xmm13 # q(.,i) + movaps %xmm13, %xmm12 + mulpd %xmm14, %xmm13 + addpd %xmm13, \X # xn = xn + q(.,i)*h1 + mulpd %xmm15, %xmm12 + addpd %xmm12, \Y # yn = yn + q(.,i)*h2 + .endm + + mac_loop1 0, %xmm0, %xmm6 + mac_loop1 16, %xmm1, %xmm7 + .if \nrows>=8 + mac_loop1 32, %xmm2, %xmm8 + mac_loop1 48, %xmm3, %xmm9 + .if \nrows==12 + mac_loop1 64, %xmm4, %xmm10 + mac_loop1 80, %xmm5, %xmm11 + .endif + .endif + .purgem mac_loop1 + + addq $8, %r11 + jmp 1b +2: + +# x1 = x1 + q(1,nb+1)*hh(nb,1) +# x2 = x2 + q(2,nb+1)*hh(nb,1) + + addq %r8, %r10 # %r10 => q(.,nb+1) + movddup (%r11), %xmm14 + + .macro mac_post_loop1 qoff, X + movaps \qoff(%r10), %xmm13 # q(.,nb+1) + mulpd %xmm14, %xmm13 + addpd %xmm13, \X + .endm + + mac_post_loop1 0, %xmm0 + mac_post_loop1 16, %xmm1 + .if \nrows>=8 + mac_post_loop1 32, %xmm2 + mac_post_loop1 48, %xmm3 + .if \nrows==12 + mac_post_loop1 64, %xmm4 + mac_post_loop1 80, %xmm5 + .endif + .endif + .purgem mac_post_loop1 + +# tau1 = hh(1,1) +# tau2 = hh(1,2) +# +# h1 = -tau1 +# x1 = x1*h1 +# x2 = x2*h1 + + movq %rsi, %r11 # restore %r11 (hh(1,1)) + + movddup (%r11), %xmm12 # hh(1,1) + xorps %xmm14, %xmm14 + subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1) + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm1 + .if \nrows>=8 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm3 + .if \nrows==12 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm5 + .endif + .endif + +# h1 = -tau2 +# h2 = -tau2*s +# y1 = y1*h1 + x1*h2 +# y2 = y2*h1 + x2*h2 + + movddup (%r11,%r9), %xmm12 # hh(1,2) + xorps %xmm15, %xmm15 + subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1 + movaps %xmm15, %xmm14 + movddup (%rsp), %xmm12 # Get s from top of stack + mulpd %xmm12, %xmm14 # %xmm14 = h2 + + .macro mac_xform_y X, Y + mulpd %xmm15, \Y # y1 = 
y1*h1 + movaps \X, %xmm12 + mulpd %xmm14, %xmm12 + addpd %xmm12, \Y + .endm + + mac_xform_y %xmm0, %xmm6 + mac_xform_y %xmm1, %xmm7 + .if \nrows>=8 + mac_xform_y %xmm2, %xmm8 + mac_xform_y %xmm3, %xmm9 + .if \nrows==12 + mac_xform_y %xmm4, %xmm10 + mac_xform_y %xmm5, %xmm11 + .endif + .endif + .purgem mac_xform_y + +# q(1,1) = q(1,1) + y1 +# q(2,1) = q(2,1) + y2 + + movq %rdi, %r10 # restore original Q + + .macro mac_pre_loop2_1 qoff, Y + movaps \qoff(%r10), %xmm13 # q(.,1) + addpd \Y, %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_pre_loop2_1 0, %xmm6 + mac_pre_loop2_1 16, %xmm7 + .if \nrows>=8 + mac_pre_loop2_1 32, %xmm8 + mac_pre_loop2_1 48, %xmm9 + .if \nrows==12 + mac_pre_loop2_1 64, %xmm10 + mac_pre_loop2_1 80, %xmm11 + .endif + .endif + .purgem mac_pre_loop2_1 + +# q(1,2) = q(1,2) + x1 + y1*hh(2,2) +# q(2,2) = q(2,2) + x2 + y2*hh(2,2) + + addq %r8, %r10 # %r10 => q(.,2) + + movddup 8(%r11,%r9), %xmm15 # hh(2,2) + + .macro mac_pre_loop2_2 qoff, X, Y + movaps \X, %xmm13 + movaps \Y, %xmm12 + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm13 + addpd \qoff(%r10), %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_pre_loop2_2 0, %xmm0, %xmm6 + mac_pre_loop2_2 16, %xmm1, %xmm7 + .if \nrows>=8 + mac_pre_loop2_2 32, %xmm2, %xmm8 + mac_pre_loop2_2 48, %xmm3, %xmm9 + .if \nrows==12 + mac_pre_loop2_2 64, %xmm4, %xmm10 + mac_pre_loop2_2 80, %xmm5, %xmm11 + .endif + .endif + .purgem mac_pre_loop2_2 + +# do i=3,nb +# h1 = hh(i-1,1) +# h2 = hh(i,2) +# q(1,i) = q(1,i) + x1*h1 + y1*h2 +# q(2,i) = q(2,i) + x2*h1 + y2*h2 +# enddo + + addq $8, %r11 + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + movddup (%r11), %xmm14 # hh(i-1,1) + movddup 8(%r11,%r9), %xmm15 # hh(i,2) + + .macro mac_loop2 qoff, X, Y + movaps \X, %xmm13 + mulpd %xmm14, %xmm13 + movaps \Y, %xmm12 + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm13 + addpd \qoff(%r10), %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_loop2 0, %xmm0, %xmm6 + 
mac_loop2 16, %xmm1, %xmm7 + .if \nrows>=8 + mac_loop2 32, %xmm2, %xmm8 + mac_loop2 48, %xmm3, %xmm9 + .if \nrows==12 + mac_loop2 64, %xmm4, %xmm10 + mac_loop2 80, %xmm5, %xmm11 + .endif + .endif + .purgem mac_loop2 + + addq $8, %r11 + jmp 1b +2: + +# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) +# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + + addq %r8, %r10 # %r10 => q(.,nb+1) + movddup (%r11), %xmm14 + + .macro mac_post_loop2 qoff, X + movaps \qoff(%r10), %xmm13 # q(.,nb+1) + mulpd %xmm14, \X + addpd \X, %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_post_loop2 0, %xmm0 + mac_post_loop2 16, %xmm1 + .if \nrows>=8 + mac_post_loop2 32, %xmm2 + mac_post_loop2 48, %xmm3 + .if \nrows==12 + mac_post_loop2 64, %xmm4 + mac_post_loop2 80, %xmm5 + .endif + .endif + .purgem mac_post_loop2 + + .endm + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +# FORTRAN Interface: +# +# subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh) +# +# integer, intent(in) :: nb, nq, ldq, ldh +# real*8, intent(inout) :: q(ldq,*) +# real*8, intent(in) :: hh(ldh,*) +# +# Parameter mapping to registers +# parameter 1: %rdi : q +# parameter 2: %rsi : hh +# parameter 3: %rdx : nb +# parameter 4: %rcx : nq +# parameter 5: %r8 : ldq +# parameter 6: %r9 : ldh +# +#------------------------------------------------------------------------------- + +#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL +#!f> interface +#!f> subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh) & +#!f> bind(C,name="double_hh_trafo_real_double_sse_assembly") +#!f> use, intrinsic :: iso_c_binding +#!f> integer(kind=c_int) :: nb, nq, ldq, ldh +#!f> type(c_ptr), value :: q +#!f> real(kind=c_double_complex) :: hh(nb,6) +#!f> end subroutine +#!f> end interface +#!f>#endif + .align 16,0x90 +double_hh_trafo_real_double_sse_assembly: + + # Get integer parameters into corresponding registers + + 
movslq (%rdx), %rdx # nb + movslq (%rcx), %rcx # nq + movslq (%r8), %r8 # ldq + movslq (%r9), %r9 # ldh + + # Get ldq in bytes + addq %r8, %r8 + addq %r8, %r8 + addq %r8, %r8 # 8*ldq, i.e. ldq in bytes + + # Get ldh in bytes + addq %r9, %r9 + addq %r9, %r9 + addq %r9, %r9 # 8*ldh, i.e. ldh in bytes + + # set %rax to the address of hh at the end of the loops, + # i.e. if %rdx >= %rax we must jump out of the loop. + # please note: %rax = 8*%rdx + %rsi - 8 + movq %rdx, %rax + addq %rax, %rax + addq %rax, %rax + addq %rax, %rax + addq %rsi, %rax + subq $8, %rax + +#----------------------------------------------------------- + # Calculate the dot product of the two Householder vectors + + # decrement stack pointer to make space for s + subq $8, %rsp + +# Fortran code: +# s = hh(2,2)*1 +# do i=3,nb +# s = s+hh(i,2)*hh(i-1,1) +# enddo + + movq %rsi, %r11 # Copy address of hh + + movsd 8(%r11,%r9), %xmm0 # hh(2,2) + addq $8, %r11 +1: + cmpq %rax, %r11 + jge 2f + movsd (%r11), %xmm14 # hh(i-1,1) + movsd 8(%r11,%r9), %xmm15 # hh(i,2) + mulsd %xmm14, %xmm15 + addsd %xmm15, %xmm0 + addq $8, %r11 + jmp 1b +2: + movsd %xmm0, (%rsp) # put s on top of stack +#----------------------------------------------------------- + +rloop_s: + cmpq $8, %rcx # if %rcx <= 8 jump out of loop + jle rloop_e + hh_trafo_real 12 # transform 12 rows + addq $96, %rdi # increment q start adress by 96 bytes (6 rows) + subq $12, %rcx # decrement nq + jmp rloop_s +rloop_e: + + cmpq $4, %rcx # if %rcx <= 4 jump to test_2 + jle test_4 + hh_trafo_real 8 # transform 8 rows + jmp return1 + +test_4: + cmpq $0, %rcx # if %rcx <= 0 jump to return + jle return1 + hh_trafo_real 4 # transform 4 rows + +return1: + addq $8, %rsp # reset stack pointer + ret + + .align 16,0x90 + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + + .macro hh_trafo_complex nrows + + # When this macro is called, the following 
registers are set and must not be changed + # %rdi: Address of q + # %rsi: Address of hh + # %rdx: nb + # %rcx: Remaining rows nq + # %r8: ldq in bytes + + movq %rdi, %r10 # Copy address of q + movq %rsi, %r11 # Copy address of hh + + # set %rax to the address of hh at the end of the loops, + # i.e. if %rdx >= %rax we must jump out of the loop. + # please note: %rax = 16*%rdx + %rsi + movq %rdx, %rax + addq %rax, %rax + addq %rax, %rax + addq %rax, %rax + addq %rax, %rax + addq %rsi, %rax + +# x1 = q(1,1); y1 = 0 +# x2 = q(2,1); y2 = 0 +# ... + + movaps (%r10), %xmm0 + movaps 16(%r10), %xmm1 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + .if \nrows>=4 + movaps 32(%r10), %xmm2 + movaps 48(%r10), %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + .if \nrows==6 + movaps 64(%r10), %xmm4 + movaps 80(%r10), %xmm5 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + .endif + .endif + +# do i=2,nb +# h1 = conjg(hh(i)) +# x1 = x1 + q(1,i)*h1 +# x2 = x2 + q(2,i)*h1 +# ... +# enddo + + addq $16, %r11 # %r11 => hh(2) + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + movddup (%r11), %xmm14 # real(hh(i)) + movddup 8(%r11), %xmm15 # imag(hh(i)) + + .macro mac_loop1 qoff, X, Y + movaps \qoff(%r10), %xmm13 # q(.,i) + movaps %xmm13, %xmm12 + mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i)) + addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i)) + mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i)) + addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i)) + .endm + + mac_loop1 0, %xmm0, %xmm6 + mac_loop1 16, %xmm1, %xmm7 + .if \nrows>=4 + mac_loop1 32, %xmm2, %xmm8 + mac_loop1 48, %xmm3, %xmm9 + .if \nrows==6 + mac_loop1 64, %xmm4, %xmm10 + mac_loop1 80, %xmm5, %xmm11 + .endif + .endif + + .purgem mac_loop1 + + addq $16, %r11 # %r11 => hh(i+1) + jmp 1b +2: + + # Now the content of the yn has to be swapped and added to xn + .macro mac_post_loop_1 X, Y + shufpd $1, \Y, \Y + addpd \Y, \X + .endm + + mac_post_loop_1 %xmm0, %xmm6 + mac_post_loop_1 
%xmm1, %xmm7 + .if \nrows>=4 + mac_post_loop_1 %xmm2, %xmm8 + mac_post_loop_1 %xmm3, %xmm9 + .if \nrows==6 + mac_post_loop_1 %xmm4, %xmm10 + mac_post_loop_1 %xmm5, %xmm11 + .endif + .endif + .purgem mac_post_loop_1 + +# tau1 = hh(1) +# +# h1 = -tau1 +# x1 = x1*h1; y1 = x1 with halfes exchanged +# x2 = x2*h1; y2 = x2 with halfes exchanged +# ... + + movq %rsi, %r11 # restore address of hh + + xorps %xmm14, %xmm14 + movddup (%r11), %xmm12 # real(hh(1)) + subpd %xmm12, %xmm14 #-real(hh(1)) + xorps %xmm15, %xmm15 + movddup 8(%r11), %xmm12 # imag(hh(1)) + subpd %xmm12, %xmm15 #-imag(hh(1)) + + .macro mac_xform X, Y + movaps \X, %xmm12 + shufpd $1, \X, %xmm12 + mulpd %xmm15, %xmm12 + mulpd %xmm14, \X + addsubpd %xmm12, \X + movaps \X, \Y # copy to y + shufpd $1, \X, \Y # exchange halfes + .endm + + mac_xform %xmm0, %xmm6 + mac_xform %xmm1, %xmm7 + .if \nrows>=4 + mac_xform %xmm2, %xmm8 + mac_xform %xmm3, %xmm9 + .if \nrows==6 + mac_xform %xmm4, %xmm10 + mac_xform %xmm5, %xmm11 + .endif + .endif + .purgem mac_xform + +# q(1,1) = q(1,1) + x1 +# q(2,1) = q(2,1) + x2 +# ... + + movq %rdi, %r10 # restore address of q + .macro mac_pre_loop2 qoff, X + movaps \qoff(%r10), %xmm13 # q(.,1) + addpd \X, %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_pre_loop2 0, %xmm0 + mac_pre_loop2 16, %xmm1 + .if \nrows>=4 + mac_pre_loop2 32, %xmm2 + mac_pre_loop2 48, %xmm3 + .if \nrows==6 + mac_pre_loop2 64, %xmm4 + mac_pre_loop2 80, %xmm5 + .endif + .endif + .purgem mac_pre_loop2 + +# do i=2,nb +# h1 = hh(i) +# q(1,i) = q(1,i) + x1*h1 +# q(2,i) = q(2,i) + x2*h1 +# ... 
+# enddo + + addq $16, %r11 + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + movddup (%r11), %xmm14 # real(hh(i)) + movddup 8(%r11), %xmm15 # imag(hh(i)) + + .macro mac_loop2 qoff, X, Y + movaps \X, %xmm13 + mulpd %xmm14, %xmm13 + movaps \Y, %xmm12 + mulpd %xmm15, %xmm12 + addsubpd %xmm12, %xmm13 + addpd \qoff(%r10), %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_loop2 0, %xmm0, %xmm6 + mac_loop2 16, %xmm1, %xmm7 + .if \nrows>=4 + mac_loop2 32, %xmm2, %xmm8 + mac_loop2 48, %xmm3, %xmm9 + .if \nrows==6 + mac_loop2 64, %xmm4, %xmm10 + mac_loop2 80, %xmm5, %xmm11 + .endif + .endif + .purgem mac_loop2 + + addq $16, %r11 + jmp 1b +2: + .endm + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +# FORTRAN Interface: +# +# subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq) +# +# integer, intent(in) :: nb, nq, ldq +# complex*16, intent(inout) :: q(ldq,*) +# complex*16, intent(in) :: hh(*) +# +# Parameter mapping to registers +# parameter 1: %rdi : q +# parameter 2: %rsi : hh +# parameter 3: %rdx : nb +# parameter 4: %rcx : nq +# parameter 5: %r8 : ldq +# +#------------------------------------------------------------------------------- +#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL +#!f> interface +#!f> subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq) & +#!f> bind(C,name="single_hh_trafo_complex_double_sse_assembly") +#!f> use, intrinsic :: iso_c_binding +#!f> integer(kind=c_int) :: nb, nq, ldq +#!f> !complex(kind=c_double_complex) :: q(*) +#!f> type(c_ptr), value :: q +#!f> complex(kind=c_double_complex) :: hh(nb,2) +#!f> end subroutine +#!f> end interface +#!f>#endif + .align 16,0x90 +single_hh_trafo_complex_double_sse_assembly: + + # Get integer parameters into corresponding registers + + movslq (%rdx), %rdx # nb + movslq (%rcx), %rcx # 
nq + movslq (%r8), %r8 # ldq + + # Get ldq in bytes + addq %r8, %r8 + addq %r8, %r8 + addq %r8, %r8 + addq %r8, %r8 # 16*ldq, i.e. ldq in bytes + +cloop_s: + cmpq $4, %rcx # if %rcx <= 4 jump out of loop + jle cloop_e + hh_trafo_complex 6 # transform 6 rows + addq $96, %rdi # increment q start adress by 96 bytes (6 rows) + subq $6, %rcx # decrement nq + jmp cloop_s +cloop_e: + + cmpq $2, %rcx # if %rcx <= 2 jump to test_2 + jle test_2 + hh_trafo_complex 4 # transform 4 rows + jmp return2 + +test_2: + cmpq $0, %rcx # if %rcx <= 0 jump to return + jle return2 + hh_trafo_complex 2 # transform 2 rows + +return2: + ret + + .align 16,0x90 +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + +# Declare that we do not need an executable stack here + .section .note.GNU-stack,"",@progbits diff -Nru elpa-2016.05.001/src/elpa2/kernels/asm_x86_64_single_precision.s elpa-2019.11.001/src/elpa2/kernels/asm_x86_64_single_precision.s --- elpa-2016.05.001/src/elpa2/kernels/asm_x86_64_single_precision.s 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/asm_x86_64_single_precision.s 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,789 @@ +# This file is part of ELPA. +# +# The ELPA library was originally created by the ELPA consortium, +# consisting of the following organizations: +# +# - Max Planck Computing and Data Facility (MPCDF), formerly known as +# Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +# - Bergische Universität Wuppertal, Lehrstuhl für angewandte +# Informatik, +# - Technische Universität München, Lehrstuhl für Informatik mit +# Schwerpunkt Wissenschaftliches Rechnen , +# - Fritz-Haber-Institut, Berlin, Abt. Theorie, +# - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +# Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +# and +# - IBM Deutschland GmbH +# +# +# More information can be found here: +# http://elpa.mpcdf.mpg.de/ +# +# ELPA is free software: you can redistribute it and/or modify +# it under the terms of the version 3 of the license of the +# GNU Lesser General Public License as published by the Free +# Software Foundation. +# +# ELPA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with ELPA. If not, see +# +# ELPA reflects a substantial effort on the part of the original +# ELPA consortium, and we ask you to respect the spirit of the +# license that we chose: i.e., please contribute any changes you +# may have back to the original ELPA library distribution, and keep +# any derivatives of ELPA under the same license that we chose for +# the original distribution, the GNU Lesser General Public License. 
+# +# Author: Andreas Marek, MPCDF + + .globl double_hh_trafo_real_single_sse_assembly + .globl single_hh_trafo_complex_single_sse_assembly + + .text +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + + .macro hh_trafo_real_single nrows + + # When this macro is called, the following registers are set and must not be changed + # %rdi: Address of q + # %rsi: Address of hh + # %rdx: nb + # %rcx: Remaining rows nq + # %r8: ldq in bytes + # %r9: ldh in bytes + # %rax: address of hh at the end of the loops + # The top of the stack must contain the dot product of the two Householder vectors + + movq %rdi, %r10 # Copy address of q + movq %rsi, %r11 # Copy address of hh + + +# x1 = q(1,2) +# x2 = q(2,2) +# +# y1 = q(1,1) + q(1,2)*hh(2,2) +# y2 = q(2,1) + q(2,2)*hh(2,2) + +# single precision implementation does not rely on complex packing ! + movaps (%r10), %xmm6 # y1 = q(1,1) ; copy content (16 bytes) starting at address %r10 (q(1,1)) into xmm6 (16 bytes = first 4 single precision values) + # y2 = q(2,1) + # y3 = q(3,1) + # y4 = q(4,1) + .if \nrows>=8 + movaps 16(%r10), %xmm7 # y5 = q(5,1) ; copy content od address r10+16 = q(5,1) into xmm7 (16 bytes = single precision values 5 and 6, 7, 8) + # y6 = q(6,1) + # y7 = q(7,1) + # y8 = q(8,1) + .if \nrows==12 + movaps 32(%r10), %xmm8 # y9 = q(9,1) ; copy content od address r10+32 = q(9,1) into xmm8 (16 bytes = single precision values 9 ,10, 11, 12) + # y10 = q(10,1) + # y11 = q(11,1) + # y12 = q(12,1) + .endif + .endif + + addq %r8, %r10 # %r10 => q(.,2) # add to r10 ldq -> r10 now is q(*,2) r10 = r10 + r8 + # carefull here ! we want to store in xmm9 four times the value of h(2,2) ! 
+ movddup 4(%r11,%r9), %xmm13 # hh(2,2) # copy from starting address r11 ldh bytes into xmm13 (wolud be hh(1,2)) shift by 4 bytes hh(2,2) and duplicate; xmm13 contains h(2,2), h(3,2), h(2,2), h(3,2) + movsldup %xmm13, %xmm9 # copy the first 4 bytes (h(2,2)) and duplicate, the same for the third 4 bytes => xmm9 contains h(2,2), h(2,2), h(2,2), h(2,2) +# movshdup %xmm13, %xmm9 + + .macro mac_pre_loop1_single qoff, X, Y + movaps \qoff(%r10), \X # xn = q(n,2) # x = r10 + qoff = q(1+qoff,2) ; x contains the values q(1+qoff,2), q(2+qoff,2) , q(3+qoff,2), q(4+qoff,2) (=4 single precision floats) + movaps \X, %xmm10 # copy x into xmm10 = q(1+qoff,2) .. q(4+qoff,2) = 4 single precision floats) + mulps %xmm9, %xmm10 # multiply 4 single precision values xmm9 (four times h(2,2)) with four single precision values q(1+qoff,2)..q(4+qoff,2) stored in xmm10; store result in xmm10 + addps %xmm10, \Y # yn = yn + xn*h(2,2) # add the four values in xmm10 (q(1+qoff,2)*h(2,2)..q(4+qoff,2)*h(2,2)) and \Y ; store in Y + .endm + + mac_pre_loop1_single 0, %xmm0, %xmm6 # do the step y(1:4) = q(1:4,1) +q(1:4,2)*h(2,2) for the first 4 single precision floats + .if \nrows>=8 + mac_pre_loop1_single 16, %xmm1, %xmm7 # for the next 4 floats + .if \nrows==12 + mac_pre_loop1_single 32, %xmm2, %xmm8 # for the next 4 floats + .endif + .endif + .purgem mac_pre_loop1_single + +# do i=3,nb +# h1 = hh(i-1,1) +# h2 = hh(i,2) +# x1 = x1 + q(1,i)*h1 +# y1 = y1 + q(1,i)*h2 +# x2 = x2 + q(2,i)*h1 +# y2 = y2 + q(2,i)*h2 +# ... 
+# enddo + + addq $4, %r11 # r11 points to hh(1,1) + 4 bytes = hh(2,1) + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # advance i %r10 => q(.,i) + # careful here we want xmm11 to contain four times the value of hh(i-1,1) + movddup (%r11), %xmm13 # copy the first 8 bytes at r11 and duplicate ; xmm13 contains hh(i-1,1), hh(i,1), hh(i-1,1), hh(i,1) + movsldup %xmm13, %xmm11 # copy the first 4 bytes (h(i-1,1)) and duplicate, the same for the third 4 bytes => xmm11 contains h(i-1,1), h(i-1,1), h(i-1,1), h(i-1,1) +# movshdup %xmm13, %xmm11 + + # carefull here we want xmm9 to contain four times the value of hh(i,2) + movddup 4(%r11,%r9), %xmm13 # add to hh(i-1,1) ldh (r9) bytes => hh(i-1,2) add 4 extra bytes => hh(i,2) and duplicate ; xmm13 contains hh(i,2), hh(i+1,2), hh(i,2), hh(i+1,2) + movsldup %xmm13, %xmm9 # copy the first 4 bytes (h(i,2)) and duplicate, the same for the third 4 bytes => xmm9 contains h(i,2), h(i,2), h(i,2), h(i,2) +# movshdup %xmm13, %xmm9 + + .macro mac_loop1_single qoff, X, Y + movaps \qoff(%r10), %xmm13 # q(.,i) copy q(1,i), q(2,i), q(3,i), q(4,i) into xmm13 + movaps %xmm13, %xmm10 # copy q(1,i), q(2,i), q(3,i) and q(4,i) into xmm10 + mulps %xmm11, %xmm13 # multiply q(1,i), q(2,i), q(3,i), q(4,i) with hh(i-1,i), h(i-1,1), h(i-1,1), h(i-1,1) ; store in xmm13 + addps %xmm13, \X # xn = xn + q(.,i)*h1 ; add to h1*q(.,i) the valye of x store in x + mulps %xmm9, %xmm10 # multiply hh(i,2), h(i,2), h(i,2), h(i,2) with q(1,i), q(2,i), q(3,i), q(4,i) store into xmm10 + addps %xmm10, \Y # yn = yn + q(.,i)*h2 ; add q(.,i)*h2 to Y store in y + .endm + + mac_loop1_single 0, %xmm0, %xmm6 + .if \nrows>=8 + mac_loop1_single 16, %xmm1, %xmm7 + .if \nrows==12 + mac_loop1_single 32, %xmm2, %xmm8 + .endif + .endif + .purgem mac_loop1_single + + addq $4, %r11 + jmp 1b +2: + +# x1 = x1 + q(1,nb+1)*hh(nb,1) +# x2 = x2 + q(2,nb+1)*hh(nb,1) + + addq %r8, %r10 # %r10 => q(.,nb+1) # add ldq on q +> q(.,nb+1) + # careful 
here we want xm11 to contain four times the value hh(nb,1) + movddup (%r11), %xmm13 # copy hh(nb,1) hh(nb+1,1) into xmm13 and duplicate + movsldup %xmm13, %xmm11 # copy the first 4 bytes (h(nb,1)) and duplicate, the same for the third 4 bytes => xmm11 contains h(nb,1), h(nb,1), h(nb,1), h(nb,1) +# movshdup %xmm13, %xmm11 + + .macro mac_post_loop1_single qoff, X + movaps \qoff(%r10), %xmm13 # q(.,nb+1) copy q(1,nb+1), q(2,nb+1) q(3,nb+1), q(4,nb+1) into xmm13 + mulps %xmm11, %xmm13 # multiply hh(nb,1) hh(nb,1) hh(nb,1) hh(nb,1) with q(1,nb+1), q(2,nb+1) q(3,nb+1), q(4,nb+1) store in xmm13 + addps %xmm13, \X # add hh(nb,1)*q(.,nb+1) and x store in x + .endm + + mac_post_loop1_single 0, %xmm0 + .if \nrows>=8 + mac_post_loop1_single 16, %xmm1 + .if \nrows==12 + mac_post_loop1_single 32, %xmm2 + .endif + .endif + .purgem mac_post_loop1_single + +# tau1 = hh(1,1) +# tau2 = hh(1,2) +# +# h1 = -tau1 +# x1 = x1*h1 +# x2 = x2*h1 + + movq %rsi, %r11 # restore %r11 (hh(1,1)) + + # carefull here we want xmm10 to contains for times the value hh(1,1) + movddup (%r11), %xmm13 # copy hh(1,1) hh(2,1) into xmm13 and duplicate + movsldup %xmm13, %xmm10 # copy the first 4 bytes (hh(n1,1)) and duplicate, the same for the third 4 bytes => xmm10 contains h(1,1), h(1,1), h(1,1), h(1,1) +# movshdup %xmm13, %xmm10 + + xorps %xmm11, %xmm11 + subps %xmm10, %xmm11 # %xmm11 = -hh(1,1) + + mulps %xmm11, %xmm0 + .if \nrows>=8 + mulps %xmm11, %xmm1 + .if \nrows==12 + mulps %xmm11, %xmm2 + .endif + .endif + + +# h1 = -tau2 +# h2 = -tau2*s +# y1 = y1*h1 + x1*h2 +# y2 = y2*h1 + x2*h2 + + # careful here we want xmm12 to contain four times hh(1,2) + movddup (%r11,%r9), %xmm13 # xmm13 contains hh(1,2) hh(2,2) and duplicate + movsldup %xmm13, %xmm10 # copy the first 4 bytes (hh(1,2)) and duplicate, the same for the third 4 bytes => xmm10 contains h(1,2), h(1,2), h(1,2), h(1,2) +# movshdup %xmm13, %xmm10 + + xorps %xmm9, %xmm9 + subps %xmm10, %xmm9 # %xmm9 = -hh(1,2) = h1 + movaps %xmm9, %xmm11 + + # 
careful here we want xmm10 to contain four times the value of s + movddup (%rsp), %xmm13 # Get s from top of stack plus unknown x and duplicate |s | x| s | x + movsldup %xmm13, %xmm10 # copy the first 4 bytes (s) and duplicate, the same for the third 4 bytes => xmm10 contains s,s,s,s +# movshdup %xmm13, %xmm10 + + mulps %xmm10, %xmm11 # %xmm14 = h2 + + .macro mac_xform_y_single X, Y + mulps %xmm9, \Y # y1 = y1*h1 + movaps \X, %xmm10 + mulps %xmm11, %xmm10 + addps %xmm10, \Y + .endm + + mac_xform_y_single %xmm0, %xmm6 + .if \nrows>=8 + mac_xform_y_single %xmm1, %xmm7 + .if \nrows==12 + mac_xform_y_single %xmm2, %xmm8 + .endif + .endif + .purgem mac_xform_y_single + +# q(1,1) = q(1,1) + y1 +# q(2,1) = q(2,1) + y2 + + movq %rdi, %r10 # restore original Q + + .macro mac_pre_loop2_1_single qoff, Y + movaps \qoff(%r10), %xmm13 # q(.,1) + addps \Y, %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_pre_loop2_1_single 0, %xmm6 + .if \nrows>=8 + mac_pre_loop2_1_single 16, %xmm7 + .if \nrows==12 + mac_pre_loop2_1_single 32, %xmm8 + .endif + .endif + .purgem mac_pre_loop2_1_single + +# q(1,2) = q(1,2) + x1 + y1*hh(2,2) +# q(2,2) = q(2,2) + x2 + y2*hh(2,2) + + addq %r8, %r10 # %r10 => q(.,2) + + # careful here we want xmm9 to contain 4 times the value of h(2,2) + movddup 4(%r11,%r9), %xmm13 # xmm13 contains hh(2,2) hh(2,3) and duplicate + movsldup %xmm13, %xmm9 # copy the first 4 bytes (hh(2,2)) and duplicate, the same for the third 4 bytes => xmm10 contains h(2,2), h(2,2), h(2,2), h(2,2) +# movshdup %xmm13, %xmm9 + + .macro mac_pre_loop2_2_single qoff, X, Y + movaps \X, %xmm13 + movaps \Y, %xmm10 + mulps %xmm9, %xmm10 + addps %xmm10, %xmm13 + addps \qoff(%r10), %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_pre_loop2_2_single 0, %xmm0, %xmm6 + .if \nrows>=8 + mac_pre_loop2_2_single 16, %xmm1, %xmm7 + .if \nrows==12 + mac_pre_loop2_2_single 32, %xmm2, %xmm8 + .endif + .endif + .purgem mac_pre_loop2_2_single + + +# do i=3,nb +# h1 = hh(i-1,1) +# h2 = hh(i,2) +# q(1,i) = 
q(1,i) + x1*h1 + y1*h2 +# q(2,i) = q(2,i) + x2*h1 + y2*h2 +# enddo + + addq $4, %r11 + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + # careful here we want xmm11 to contain 4 times the value of hh(i-1,1) + movddup (%r11), %xmm13 # hh(i-1,1) | hh(i,1) | hh(i-1,1) | hh(i,1) + movsldup %xmm13, %xmm11 # copy the first 4 bytes hh(i-1,1) +# movshdup %xmm13, %xmm11 + + # careful here we want xmm9 to contain 4 times the value of hh(i,2) + movddup 4(%r11,%r9), %xmm13 # hh(i,2) | hh(i+1,2) and duplicate + movsldup %xmm13, %xmm9 # copy the first 4 bytes hh(i,2) +# movshdup %xmm13, %xmm9 + + .macro mac_loop2_single qoff, X, Y + movaps \X, %xmm13 + mulps %xmm11, %xmm13 + movaps \Y, %xmm10 + mulps %xmm9, %xmm10 + addps %xmm10, %xmm13 + addps \qoff(%r10), %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_loop2_single 0, %xmm0, %xmm6 + .if \nrows>=8 + mac_loop2_single 16, %xmm1, %xmm7 + .if \nrows==12 + mac_loop2_single 32, %xmm2, %xmm8 + .endif + .endif + .purgem mac_loop2_single + + addq $4, %r11 + jmp 1b + +2: + +# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) +# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + + addq %r8, %r10 # %r10 => q(.,nb+1) + + # carefule here we want xm11 to contain 4 times the value of hh(nb,1) + movddup (%r11), %xmm13 # hh(nb,1) | hh(nb+1,1) and duplicate + movsldup %xmm13, %xmm11 # copy the first 4 bytes hh(nb,1) +# movshdup %xmm13, %xmm11 + + .macro mac_post_loop2_single qoff, X + movaps \qoff(%r10), %xmm13 # q(.,nb+1) + mulps %xmm11, \X + addps \X, %xmm13 + movaps %xmm13, \qoff(%r10) + .endm + + mac_post_loop2_single 0, %xmm0 + .if \nrows>=8 + mac_post_loop2_single 16, %xmm1 + .if \nrows==12 + mac_post_loop2_single 32, %xmm2 + .endif + .endif + .purgem mac_post_loop2_single + + .endm + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +# FORTRAN Interface: +# +# subroutine 
double_hh_trafo_real_single_sse_assembly(q, hh, nb, nq, ldq, ldh) +# +# integer, intent(in) :: nb, nq, ldq, ldh +# real*8, intent(inout) :: q(ldq,*) +# real*8, intent(in) :: hh(ldh,*) +# +# Parameter mapping to registers +# parameter 1: %rdi : q +# parameter 2: %rsi : hh +# parameter 3: %rdx : nb +# parameter 4: %rcx : nq +# parameter 5: %r8 : ldq +# parameter 6: %r9 : ldh +# +#------------------------------------------------------------------------------- +#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL +#!f>#ifdef WANT_SINGLE_PRECISION_REAL +#!f> interface +#!f> subroutine double_hh_trafo_real_single_sse_assembly(q, hh, nb, nq, ldq, ldh) & +#!f> bind(C,name="double_hh_trafo_real_single_sse_assembly") +#!f> use, intrinsic :: iso_c_binding +#!f> integer(kind=c_int) :: nb, nq, ldq, ldh +#!f> type(c_ptr), value :: q +#!f> real(kind=c_float) :: hh(nb,6) +#!f> end subroutine +#!f> end interface +#!f>#endif +#!f>#endif + .align 16,0x90 +double_hh_trafo_real_single_sse_assembly: + + # Get integer parameters into corresponding registers + + movslq (%rdx), %rdx # nb + movslq (%rcx), %rcx # nq + movslq (%r8), %r8 # ldq + movslq (%r9), %r9 # ldh + + # Get ldq in bytes + addq %r8, %r8 + addq %r8, %r8 # 4*ldq, i.e. ldq in bytes + + # Get ldh in bytes + addq %r9, %r9 + addq %r9, %r9 # 4*ldq, i.e. ldh in bytes + + # set %rax to the address of hh at the end of the loops, + # i.e. if %rdx >= %rax we must jump out of the loop. 
+ # please note: %rax = 4*%rdx + %rsi - 4 + movq %rdx, %rax + addq %rax, %rax + addq %rax, %rax + addq %rsi, %rax + subq $4, %rax + +#----------------------------------------------------------- + # Calculate the dot product of the two Householder vectors + + # decrement stack pointer to make space for s + subq $4, %rsp + +# Fortran code: +# s = hh(2,2)*1 +# do i=3,nb +# s = s+hh(i,2)*hh(i-1,1) +# enddo + + movq %rsi, %r11 # Copy address of hh + + movss 4(%r11,%r9), %xmm0 # hh(2,2) + addq $4, %r11 +1: + cmpq %rax, %r11 + jge 2f + movss (%r11), %xmm11 # hh(i-1,1) + movss 4(%r11,%r9), %xmm9 # hh(i,2) + mulss %xmm11, %xmm9 + addss %xmm9, %xmm0 + addq $4, %r11 + jmp 1b +2: + movss %xmm0, (%rsp) # put s on top of stack +#----------------------------------------------------------- + +rloop_single: + cmpq $8, %rcx # if %rcx <= 8 jump out of loop + jle rloop_e + hh_trafo_real_single 12 # transform 12 rows + addq $48, %rdi # increment q start adress by 48 bytes (6 rows) + subq $12, %rcx # decrement nq + jmp rloop_single + +rloop_e: + cmpq $4, %rcx # if %rcx <= 4 jump to test_2 + jle test_4 + hh_trafo_real_single 8 # transform 8 rows + jmp return1 + +test_4: + cmpq $0, %rcx # if %rcx <= 0 jump to return + jle return1 + hh_trafo_real_single 4 # transform 4 rows + +return1: + addq $4, %rsp # reset stack pointer + ret + + .align 16,0x90 + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + + .macro hh_trafo_complex_single nrows + + # When this macro is called, the following registers are set and must not be changed + # %rdi: Address of q + # %rsi: Address of hh + # %rdx: nb + # %rcx: Remaining rows nq + # %r8: ldq in bytes + + movq %rdi, %r10 # Copy address of q + movq %rsi, %r11 # Copy address of hh + + # set %rax to the address of hh at the end of the loops, + # i.e. if %rdx >= %rax we must jump out of the loop. 
+ # please note: %rax = 8*%rdx + %rsi + movq %rdx, %rax + addq %rax, %rax + addq %rax, %rax + addq %rax, %rax # 8 * rax + addq %rsi, %rax + +# x1 = q(1,1); y1 = 0 +# x2 = q(2,1); y2 = 0 +# ... + + movaps (%r10), %xmm0 # xmm0 now contains the first 16 bytes of q => TWO single precision complex q(1,1), q(2,1) + xorps %xmm3, %xmm3 + .if \nrows>=4 + movaps 16(%r10), %xmm1 # xmm1 now contains the second 16 bytes of q => TWO single precision complex q(3,1), q(4,1) + xorps %xmm4, %xmm4 + .if \nrows==6 + movaps 32(%r10), %xmm2 # xmm2 now contains the third 16 bytes of q => TWO single precision complex q(5,1), q(6,1) + xorps %xmm5, %xmm5 + .endif + .endif + +# do i=2,nb +# h1 = conjg(hh(i)) +# x1 = x1 + q(1,i)*h1 +# x2 = x2 + q(2,i)*h1 +# ... +# enddo + + addq $8, %r11 # %r11 => hh(2) + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + # movddup (%r11), %xmm7 # real(hh(i)) + # movddup 8(%r11), %xmm8 # imag(hh(i)) + + # we use xmm6 as dummy variable + xorps %xmm6, %xmm6 + + movddup (%r11), %xmm6 # copy the single precision complex value h(i) in xmm6 and duplicate real(h(i)) | imag(h(i)) | real(h(i)) | imag(h(i)) + movsldup %xmm6, %xmm7 # copy the fist 4 bytes of xmm6 and duplicate in lower half, copy the third 4 bytes of xmm6 and duplicate in upper half -> real(h(i)), real(h(i)), real(h(i)), real(h(i)) + movshdup %xmm6, %xmm8 # as before but with 2nd 4bytes and fouth 4 bytes; xmm8 contains complex(h(i)), complex(h(i)), complex(h(i)), complex(h(i)) + +# movshdup %xmm6, %xmm7 # copy the real part of h(i) into xmm7 four times ; xmm7 contains real(h(i)), real(h(i)), real(h(i)), real(h(i)) +# movsldup %xmm6, %xmm8 # copy the complex part of h(i) into xmm8 four times ; xmm8 contains complex(h(i)), complex(h(i)), complex(h(i)), complex(h(i)) + + + + .macro mac_loop1_single qoff, X, Y + movaps \qoff(%r10), %xmm13 # q(.,i) ; copy TWO single precision complex q(1,1) and q(2,1) into xmm6 + movaps %xmm13, %xmm9 # copy 
xmm6 into xmm9 + mulps %xmm7, %xmm13 # q(.,i)*real(hh(i)) # multiply real(hh(i)), real(h(i)), real(h(i)), real(h(i)) with TWO single precision COMPLEX q(1,1), q(2,1) + addps %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i)) # add the four single precision parts + mulps %xmm8, %xmm9 # q(.,i)*imag(hh(i)) # multiply contains complex(h(i)), complex(h(i)), complex(h(i)), complex(h(i)) with TWO single precision COMPLEX q(1,1), q(2,1) + addsubps %xmm9, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i)) # add the four single precision parts + .endm + + mac_loop1_single 0, %xmm0, %xmm3 + .if \nrows>=4 + mac_loop1_single 16, %xmm1, %xmm4 + .if \nrows==6 + mac_loop1_single 32, %xmm2, %xmm5 + .endif + .endif + + .purgem mac_loop1_single + + addq $8, %r11 # %r11 => hh(i+1) + jmp 1b +2: + + # Now the content of the yn has to be swapped and added to xn + .macro mac_post_loop_1_single X, Y + shufps $0b10110001, \Y, \Y + addps \Y, \X + .endm + + mac_post_loop_1_single %xmm0, %xmm3 + .if \nrows>=4 + mac_post_loop_1_single %xmm1, %xmm4 + .if \nrows==6 + mac_post_loop_1_single %xmm2, %xmm5 + .endif + .endif + .purgem mac_post_loop_1_single + +# tau1 = hh(1) +# +# h1 = -tau1 +# x1 = x1*h1; y1 = x1 with halfes exchanged +# x2 = x2*h1; y2 = x2 with halfes exchanged +# ... + + movq %rsi, %r11 # restore address of hh + + # copy four times the real part of hh(1) and change sign, same for complex part + # in the end xmm8 should be -im(hh(1)) | -im(hh(1)) | -im(hh(1)) | -im(hh(1)) + # in the end xmm7 should be -re(hh(1)) | -re(hh(1)) | -re(hh(1)) | -re(hh(1)) + +# movddup (%r11), %xmm9 # real(hh(1)) +# movddup 8(%r11), %xmm7 # imag(hh(1)) + + xorps %xmm10, %xmm10 # dummy variable + xorps %xmm7, %xmm7 + xorps %xmm8, %xmm8 + + movddup (%r11), %xmm6 # copy the single precision complex value h(i) in xmm6 and duplicate! 
xmm6 = re | im | re | im + subps %xmm6, %xmm10 # change the signs of real and imaginary parts; xmm10 = - re | -im | -re | - im + movsldup %xmm10, %xmm7 # copy the real part of -h(i) into xmm7 four times ; xmm7 contains -real(h(i)), -real(h(i)), -real(h(i)), -real(h(i)) + movshdup %xmm10, %xmm8 # copy the complex part of h(i) into xmm8 four times ; xmm8 contains -complex(h(i)), -complex(h(i)), -complex(h(i)), -complex(h(i)) +# movshdup %xmm10, %xmm7 # copy the real part of -h(i) into xmm7 four times ; xmm7 contains -real(h(i)), -real(h(i)), -real(h(i)), -real(h(i)) +# movsldup %xmm10, %xmm8 # copy the complex part of h(i) into xmm8 four times ; xmm8 contains -complex(h(i)), -complex(h(i)), -complex(h(i)), -complex(h(i)) + + +# maybe not neccessrary + xorps %xmm9, %xmm9 + + .macro mac_xform_single X, Y + movaps \X, %xmm6 + shufps $0b10110001, \X, %xmm6 + mulps %xmm8, %xmm6 + mulps %xmm7, \X + addsubps %xmm6, \X + movaps \X, \Y # copy to y + shufps $0b10110001, \X, \Y + .endm + + mac_xform_single %xmm0, %xmm3 + .if \nrows>=4 + mac_xform_single %xmm1, %xmm4 + .if \nrows==6 + mac_xform_single %xmm2, %xmm5 + .endif + .endif + .purgem mac_xform_single + +# q(1,1) = q(1,1) + x1 +# q(2,1) = q(2,1) + x2 +# ... + + movq %rdi, %r10 # restore address of q + .macro mac_pre_loop2_single qoff, X + movaps \qoff(%r10), %xmm6 # q(.,1) + addps \X, %xmm6 + movaps %xmm6, \qoff(%r10) + .endm + + mac_pre_loop2_single 0, %xmm0 + .if \nrows>=4 + mac_pre_loop2_single 16, %xmm1 + .if \nrows==6 + mac_pre_loop2_single 32, %xmm2 + .endif + .endif + .purgem mac_pre_loop2_single + +# do i=2,nb +# h1 = hh(i) +# q(1,i) = q(1,i) + x1*h1 +# q(2,i) = q(2,i) + x2*h1 +# ... 
+# enddo + + addq $8, %r11 + .align 16 +1: + cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax + jge 2f + + addq %r8, %r10 # %r10 => q(.,i) + + # carefull here we want xmm7 to contain four times the value of real(hh(i)) + # and xmm8 to contain four times the value of imag(hh(i)) +# movddup (%r11), %xmm7 # real(hh(i)) +# movddup 8(%r11), %xmm8 # imag(hh(i)) + + movddup (%r11), %xmm6 # copy the single precision complex value h(i) in xmm6 and duplicate ; real(h(i)) | imag(h(i)) | real(h(i)) | imag(h(i)) + movsldup %xmm6, %xmm7 # copy the real part of h(i) into xmm7 four times ; xmm7 contains real(h(i)), real(h(i)), real(h(i)), real(h(i)) + movshdup %xmm6, %xmm8 # copy the complex part of h(i) into xmm8 four times ; xmm8 contains complex(h(i)), complex(h(i)), complex(h(i)), complex(h(i)) + + .macro mac_loop2_single qoff, X, Y + movaps \X, %xmm6 + mulps %xmm7, %xmm6 + movaps \Y, %xmm9 + mulps %xmm8, %xmm9 + addsubps %xmm9, %xmm6 + addps \qoff(%r10), %xmm6 + movaps %xmm6, \qoff(%r10) + .endm + + mac_loop2_single 0, %xmm0, %xmm3 + .if \nrows>=4 + mac_loop2_single 16, %xmm1, %xmm4 + .if \nrows==6 + mac_loop2_single 32, %xmm2, %xmm5 + .endif + .endif + .purgem mac_loop2_single + + addq $8, %r11 + jmp 1b +2: + .endm + + +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +# FORTRAN Interface: +# +# subroutine single_hh_trafo_complex_single_sse_assembly(q, hh, nb, nq, ldq) +# +# integer, intent(in) :: nb, nq, ldq +# complex(kind=c_float_complex), intent(inout) :: q(ldq,*) +# complex(kind=c_float_complex), intent(in) :: hh(*) +# +# Parameter mapping to registers +# parameter 1: %rdi : q +# parameter 2: %rsi : hh +# parameter 3: %rdx : nb +# parameter 4: %rcx : nq +# parameter 5: %r8 : ldq +# +#------------------------------------------------------------------------------- +#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL +#!f>#ifdef WANT_SINGLE_PRECISION_COMPLEX 
+#!f> interface +#!f> subroutine single_hh_trafo_complex_single_sse_assembly(q, hh, nb, nq, ldq) & +#!f> bind(C,name="single_hh_trafo_complex_single_sse_assembly") +#!f> use, intrinsic :: iso_c_binding +#!f> integer(kind=c_int) :: nb, nq, ldq +#!f> !complex(kind=c_float_complex) :: q(*) +#!f> type(c_ptr), value :: q +#!f> complex(kind=c_float_complex) :: hh(nb,2) +#!f> end subroutine +#!f> end interface +#!f>#endif +#!f>#endif + + .align 16,0x90 +single_hh_trafo_complex_single_sse_assembly: + + # Get integer parameters into corresponding registers + + movslq (%rdx), %rdx # nb + movslq (%rcx), %rcx # nq + movslq (%r8), %r8 # ldq + + # Get ldq in bytes + addq %r8, %r8 + addq %r8, %r8 + addq %r8, %r8 # 8*ldq, i.e. ldq in bytes + +cloop_s: + cmpq $4, %rcx # if %rcx <= 4 jump out of loop + jle cloop_e + hh_trafo_complex_single 6 # transform 6 rows + addq $48, %rdi # increment q start adress by 48 bytes (6 rows) + subq $6, %rcx # decrement nq + jmp cloop_s +cloop_e: + + cmpq $2, %rcx # if %rcx <= 2 jump to test_2 + jle test_2 + hh_trafo_complex_single 4 # transform 4 rows + jmp return2 + +test_2: + cmpq $0, %rcx # if %rcx <= 0 jump to return + jle return2 + hh_trafo_complex_single 2 # transform 2 rows + +return2: + ret + + .align 16,0x90 +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- + +# Declare that we do not need an executable stack here + .section .note.GNU-stack,"",@progbits diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c elpa-2019.11.001/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c --- elpa-2016.05.001/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c 2019-12-19 
09:47:42.000000000 +0000 @@ -0,0 +1,6111 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke +// +#include "config-f90.h" + +#define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h) +#define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h + +#define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g) +#define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g + +#define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f) +#define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f + +#define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e) +#define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e + +#define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d) +#define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d + +#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c) +#define CONCAT2_3ARGS(a, b, c) a ## b ## c + +//define instruction set numbers +#define SSE_128 128 +#define AVX_256 256 +#define AVX_512 512 +#define NEON_ARCH64_128 1285 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 +#include +#ifdef BLOCK2 +#if VEC_SET == SSE_128 +#include +#endif +#endif + +#define __forceinline __attribute__((always_inline)) + +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 */ + +#if VEC_SET == NEON_ARCH64_128 +#include +#endif + +#include + +#include +#include + +#ifdef BLOCK2 +#define PREFIX double +#define BLOCK 2 +#endif + +#ifdef BLOCK1 +#define PREFIX single +#define BLOCK 1 +#endif + +#if VEC_SET == SSE_128 +#define SIMD_SET SSE +#endif + +#if VEC_SET == NEON_ARCH64_128 +#define SIMD_SET NEON_ARCH64 +#endif + +#if VEC_SET == AVX_256 +#define SIMD_SET AVX_AVX2 +#endif + +#if VEC_SET == AVX_512 +#define SIMD_SET AVX512 +#endif + + +#if VEC_SET == SSE_128 + +#ifdef DOUBLE_PRECISION_COMPLEX +#define offset 2 +#define __SIMD_DATATYPE __m128d +#define _SIMD_LOAD _mm_load_pd +#define _SIMD_LOADU _mm_loadu_pd +#define 
_SIMD_STORE _mm_store_pd +#define _SIMD_STOREU _mm_storeu_pd +#define _SIMD_MUL _mm_mul_pd +#define _SIMD_ADD _mm_add_pd +#define _SIMD_XOR _mm_xor_pd +#define _SIMD_ADDSUB _mm_addsub_pd +#define _SIMD_SHUFFLE _mm_shuffle_pd +#define _SHUFFLE _MM_SHUFFLE2(0,1) + +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMSUBADD _mm_maddsub_pd +#endif +#endif /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef SINGLE_PRECISION_COMPLEX +#define offset 4 +#define __SIMD_DATATYPE __m128 +#define _SIMD_LOAD _mm_load_ps +#define _SIMD_LOADU _mm_loadu_ps +#define _SIMD_STORE _mm_store_ps +#define _SIMD_STOREU _mm_storeu_ps +#define _SIMD_MUL _mm_mul_ps +#define _SIMD_ADD _mm_add_ps +#define _SIMD_XOR _mm_xor_ps +#define _SIMD_ADDSUB _mm_addsub_ps +#define _SIMD_SHUFFLE _mm_shuffle_ps +#define _SHUFFLE 0xb1 + +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMSUBADD _mm_maddsub_ps +#endif + +#endif /* SINGLE_PRECISION_COMPLEX */ + +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == NEON_128 + +#ifdef DOUBLE_PRECISION_COMPLEX +#define offset 2 +#define __SIMD_DATATYPE __Float64x2_t +#define _SIMD_LOAD vld1q_f64 +#define _SIMD_LOADU _mm_loadu_pd +#define _SIMD_STORE vst1q_f64 +#define _SIMD_STOREU _mm_storeu_pd +#define _SIMD_MUL vmulq_f64 +#define _SIMD_ADD vaddq_f64 +#define _SIMD_XOR _mm_xor_pd +#define _SIMD_ADDSUB _mm_addsub_pd +#define _SIMD_SHUFFLE _mm_shuffle_pd +#define _SHUFFLE _MM_SHUFFLE2(0,1) + +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMSUBADD _mm_maddsub_pd +#endif +#endif /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef SINGLE_PRECISION_COMPLEX +#define offset 4 +#define __SIMD_DATATYPE __m128 +#define _SIMD_LOAD _mm_load_ps +#define _SIMD_LOADU _mm_loadu_ps +#define _SIMD_STORE _mm_store_ps +#define _SIMD_STOREU _mm_storeu_ps +#define _SIMD_MUL _mm_mul_ps +#define _SIMD_ADD _mm_add_ps +#define _SIMD_XOR _mm_xor_ps +#define _SIMD_ADDSUB _mm_addsub_ps +#define _SIMD_SHUFFLE _mm_shuffle_ps +#define _SHUFFLE 0xb1 + +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMSUBADD _mm_maddsub_ps +#endif + +#endif /* 
SINGLE_PRECISION_COMPLEX */ + +#endif /* VEC_SET == NEON_128 */ + +#if VEC_SET == AVX_256 + +#ifdef DOUBLE_PRECISION_COMPLEX +#define offset 4 +#define __SIMD_DATATYPE __m256d +#define _SIMD_LOAD _mm256_load_pd +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm256_store_pd +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm256_mul_pd +#define _SIMD_ADD _mm256_add_pd +#define _SIMD_XOR _mm256_xor_pd +#define _SIMD_BROADCAST _mm256_broadcast_sd +#define _SIMD_SET1 _mm256_set1_pd +#define _SIMD_ADDSUB _mm256_addsub_pd +#define _SIMD_SHUFFLE _mm256_shuffle_pd +#define _SHUFFLE 0x5 + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) +#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) +#endif + +#define _SIMD_FMADDSUB _mm256_FMADDSUB_pd +#define _SIMD_FMSUBADD _mm256_FMSUBADD_pd +#endif /* HAVE_AVX2 */ + +#endif /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef SINGLE_PRECISION_COMPLEX +#define offset 8 +#define __SIMD_DATATYPE __m256 +#define _SIMD_LOAD _mm256_load_ps +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm256_store_ps +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm256_mul_ps +#define _SIMD_ADD _mm256_add_ps +#define _SIMD_XOR _mm256_xor_ps +#define _SIMD_BROADCAST _mm256_broadcast_ss +#define _SIMD_SET1 _mm256_set1_ps +#define _SIMD_ADDSUB _mm256_addsub_ps +#define _SIMD_SHUFFLE _mm256_shuffle_ps +#define _SHUFFLE 0xb1 + +#ifdef HAVE_AVX2 + +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c) +#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c) +#endif + +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c) +#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c) +#endif + +#define _SIMD_FMADDSUB _mm256_FMADDSUB_ps 
+#define _SIMD_FMSUBADD _mm256_FMSUBADD_ps +#endif /* HAVE_AVX2 */ + +#endif /* SINGLE_PRECISION_COMPLEX */ + +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + +#ifdef DOUBLE_PRECISION_COMPLEX +#define offset 8 +#define __SIMD_DATATYPE __m512d +#define _SIMD_LOAD _mm512_load_pd +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm512_store_pd +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm512_mul_pd +#define _SIMD_ADD _mm512_add_pd +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_pd +#endif +#define _SIMD_BROADCAST 1 +#define _SIMD_SET1 _mm512_set1_pd +#define _SIMD_SET _mm512_set_pd +#define _SIMD_XOR_EPI _mm512_xor_epi64 +#define _SIMD_ADDSUB 1 +#define _SIMD_SHUFFLE _mm512_shuffle_pd +#define _SIMD_MASK_STOREU _mm512_mask_storeu_pd +#define _SHUFFLE 0x55 + +#ifdef HAVE_AVX512 +#define __ELPA_USE_FMA__ +#define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c) +#define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c) + +#define _SIMD_FMADDSUB _mm512_FMADDSUB_pd +#define _SIMD_FMSUBADD _mm512_FMSUBADD_pd +#endif /* HAVE_AVX512 */ + +#endif /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef SINGLE_PRECISION_COMPLEX +#define offset 16 +#define __SIMD_DATATYPE __m512 +#define _SIMD_LOAD _mm512_load_ps +#define _SIMD_LOADU 1 +#define _SIMD_STORE _mm512_store_ps +#define _SIMD_STOREU 1 +#define _SIMD_MUL _mm512_mul_ps +#define _SIMD_ADD _mm512_add_ps +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_ps +#endif +#define _SIMD_BROADCAST 1 +#define _SIMD_SET1 _mm512_set1_ps +#define _SIMD_SET _mm512_set_ps +#define _SIMD_ADDSUB 1 +#define _SIMD_SHUFFLE _mm512_shuffle_ps +#define _SIMD_MASK_STOREU _mm512_mask_storeu_ps +#define _SIMD_XOR_EPI _mm512_xor_epi32 +#define _SHUFFLE 0xb1 + +#ifdef HAVE_AVX512 + +#define __ELPA_USE_FMA__ +#define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c) +#define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c) + +#define _SIMD_FMADDSUB _mm512_FMADDSUB_ps +#define _SIMD_FMSUBADD _mm512_FMSUBADD_ps +#endif /* HAVE_AVX512 */ + 
+#endif /* SINGLE_PRECISION_COMPLEX */ + +#endif /* VEC_SET == AVX_512 */ + + + + +#define __forceinline __attribute__((always_inline)) + +#ifdef HAVE_SSE_INTRINSICS +#undef __AVX__ +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX +#define WORD_LENGTH double +#define DATA_TYPE double complex +#define DATA_TYPE_PTR double complex* +#define DATA_TYPE_REAL double +#define DATA_TYPE_REAL_PTR double* +#endif + +#ifdef SINGLE_PRECISION_COMPLEX +#define WORD_LENGTH single +#define DATA_TYPE float complex +#define DATA_TYPE_PTR float complex* +#define DATA_TYPE_REAL float +#define DATA_TYPE_REAL_PTR float* +#endif + + +//Forward declaration + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 5 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 10 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == AVX_256 */ + 
+#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 3 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 6 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define 
ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 1 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 2 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void 
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ); +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s); +#endif + + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine single_hh_trafo_complex_SSE_1hv_double(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine single_hh_trafo_complex_SSE_1hv_single(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_SSE_1hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_double(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_single(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! 
complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX512_1hv_double(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine single_hh_trafo_complex_AVX512_1hv_single(q, hh, pnb, pnq, pldq) & +!f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq +!f> ! complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine double_hh_trafo_complex_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_SSE_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine double_hh_trafo_complex_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_SSE_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! 
complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine double_hh_trafo_complex_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! complex(kind=c_double_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_double_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine double_hh_trafo_complex_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> ! 
complex(kind=c_float_complex) :: q(*) +!f> type(c_ptr), value :: q +!f> complex(kind=c_float_complex) :: hh(pnb,2) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int* pldh) +#endif +{ + + int i, worked_on; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; +#ifdef BLOCK2 + int ldh = *pldh; + + DATA_TYPE s = conj(hh[(ldh)+1])*1.0; + + for (i = 2; i < nb; i++) + { + s += hh[i-1] * conj(hh[(i+ldh)]); + } +#endif + + worked_on = 0; + +#ifdef BLOCK1 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#define STEP_SIZE 6 +#define UPPER_BOUND 5 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#define STEP_SIZE 12 +#define UPPER_BOUND 10 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#define STEP_SIZE 12 +#define UPPER_BOUND 10 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#define STEP_SIZE 24 +#define UPPER_BOUND 20 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#define STEP_SIZE 24 +#define UPPER_BOUND 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 48 +#define STEP_SIZE 48 +#define UPPER_BOUND 40 +#endif +#endif /* VEC_SET == AVX_512 */ + + + for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) + { + + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + + if (nq == i) { + return; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 5 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 10 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef 
DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 3 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + 
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 1 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq); + worked_on += ROW_LENGTH; + } + +#endif /* BLOCK1 */ + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#define STEP_SIZE 4 +#define UPPER_BOUND 3 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#define STEP_SIZE 8 +#define UPPER_BOUND 6 
+#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#define STEP_SIZE 8 +#define UPPER_BOUND 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#define STEP_SIZE 16 +#define UPPER_BOUND 12 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#define STEP_SIZE 16 +#define UPPER_BOUND 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 32 +#define STEP_SIZE 32 +#define UPPER_BOUND 24 +#endif +#endif /* VEC_SET == AVX_512 */ + + for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on +=ROW_LENGTH; + } + + if (nq == i) + { + return; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 3 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef 
SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 1 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#undef ROW_LENGTH +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#endif /* BLOCK2 */ + +#ifdef WITH_DEBUG + if (worked_on != nq) + { + printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq); + abort(); + } +#endif + +} + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == 
AVX_512 */ +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + + DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1, x2, x3, x4, x5, x6; + __SIMD_DATATYPE q1, q2, q3, q4, q5, q6; +#ifdef BLOCK2 + __SIMD_DATATYPE y1, y2, y3, y4, y5, y6; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); + x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]); + x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]); + x5 = 
_SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]); + x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + y2 = _SIMD_LOAD(&q_dbl[offset]); + y3 = _SIMD_LOAD(&q_dbl[2*offset]); + y4 = _SIMD_LOAD(&q_dbl[3*offset]); + y5 = _SIMD_LOAD(&q_dbl[4*offset]); + y6 = _SIMD_LOAD(&q_dbl[5*offset]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, 
_SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, x5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h2_imag, x6); +#ifdef __ELPA_USE_FMA__ + y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = _SIMD_LOAD(&q_dbl[0]); + x2 = _SIMD_LOAD(&q_dbl[offset]); + x3 = _SIMD_LOAD(&q_dbl[2*offset]); + x4 = _SIMD_LOAD(&q_dbl[3*offset]); + x5 = _SIMD_LOAD(&q_dbl[4*offset]); + x6 = _SIMD_LOAD(&q_dbl[5*offset]); +#endif + + for (i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + 
q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]); + q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + tmp5 = _SIMD_MUL(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h1_imag, q6); +#ifdef __ELPA_USE_FMA__ + x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double 
*)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, q5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h2_imag, q6); +#ifdef __ELPA_USE_FMA__ + y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, q6, _SIMD_SHUFFLE(tmp6, 
tmp6, _SHUFFLE))); +#else + y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + } + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]); + q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( 
_SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h1_imag, q6); +#ifdef __ELPA_USE_FMA__ + x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if 
defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#else + x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#endif + tmp6 = _SIMD_MUL(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + x6 = _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); +#else + x6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double 
*)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == 128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI + +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + 
h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif +#endif +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif /* VEC_SET != AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == 
AVX_256 */ + + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + tmp2 = _SIMD_MUL(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + tmp4 = _SIMD_MUL(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + + tmp5 = _SIMD_MUL(h1_imag, y5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#else + y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#endif + tmp6 = _SIMD_MUL(h1_imag, y6); +#ifdef __ELPA_USE_FMA__ + y6 = _SIMD_FMADDSUB(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); +#else + y6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, 
_SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, x5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h2_imag, x6); +#ifdef __ELPA_USE_FMA__ + y6 = _SIMD_ADD(y6, _SIMD_FMADDSUB(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + q2 = _SIMD_LOAD(&q_dbl[offset]); + q3 = _SIMD_LOAD(&q_dbl[2*offset]); + q4 = _SIMD_LOAD(&q_dbl[3*offset]); + q5 = _SIMD_LOAD(&q_dbl[4*offset]); + q6 = _SIMD_LOAD(&q_dbl[5*offset]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); + q5 = _SIMD_ADD(q5, x5); + q6 = _SIMD_ADD(q6, x6); +#endif + + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); + q3 = _SIMD_ADD(q3, y3); + q4 = _SIMD_ADD(q4, y4); + q5 = _SIMD_ADD(q5, y5); + q6 = _SIMD_ADD(q6, y6); +#endif + + _SIMD_STORE(&q_dbl[0], q1); + _SIMD_STORE(&q_dbl[offset], q2); + _SIMD_STORE(&q_dbl[2*offset], q3); + _SIMD_STORE(&q_dbl[3*offset], q4); + _SIMD_STORE(&q_dbl[4*offset], q5); + _SIMD_STORE(&q_dbl[5*offset], q6); + + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + 
h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]); + q6 = _SIMD_LOAD(&q_dbl[(ldq*2)+5*offset]); + + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); + q5 = _SIMD_ADD(q5, x5); + q6 = _SIMD_ADD(q6, x6); + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); 
+#endif + + tmp5 = _SIMD_MUL(h2_imag, y5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h2_imag, y6); +#ifdef __ELPA_USE_FMA__ + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2); + _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3); + _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4); + _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5); + _SIMD_STORE(&q_dbl[(ldq*2)+5*offset], q6); + +#endif /* BLOCK2 */ + + + for (i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]); + q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, 
_SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = 
_SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, y5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h2_imag, y6); +#ifdef __ELPA_USE_FMA__ + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3); + 
_SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4); + _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5); + _SIMD_STORE(&q_dbl[(2*i*ldq)+5*offset], q6); + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]); + q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef 
__ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + tmp6 = _SIMD_MUL(h1_imag, x6); +#ifdef __ELPA_USE_FMA__ + q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#else + q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+5*offset], q6); + +#endif /* BLOCK2 */ + +} + + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 5 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 10 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_512 */ +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + + DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR 
s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1, x2, x3, x4, x5; + __SIMD_DATATYPE q1, q2, q3, q4, q5; +#ifdef BLOCK2 + __SIMD_DATATYPE y1, y2, y3, y4, y5; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); + x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]); + x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]); + x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double 
*)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + y2 = _SIMD_LOAD(&q_dbl[offset]); + y3 = _SIMD_LOAD(&q_dbl[2*offset]); + y4 = _SIMD_LOAD(&q_dbl[3*offset]); + y5 = _SIMD_LOAD(&q_dbl[4*offset]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, x5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = 
_SIMD_LOAD(&q_dbl[0]); + x2 = _SIMD_LOAD(&q_dbl[offset]); + x3 = _SIMD_LOAD(&q_dbl[2*offset]); + x4 = _SIMD_LOAD(&q_dbl[3*offset]); + x5 = _SIMD_LOAD(&q_dbl[4*offset]); +#endif + + for (i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = 
_SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + tmp5 = _SIMD_MUL(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = 
_SIMD_MUL(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, q5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + } + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, 
_SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h1_imag, q5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = 
(__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#else + x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef 
SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI + +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON +#if 
defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif +#endif +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif /* VEC_SET != AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + 
+#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == AVX_256 */ + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + tmp2 = _SIMD_MUL(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + tmp4 = _SIMD_MUL(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + + tmp5 = _SIMD_MUL(h1_imag, y5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#else + y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( 
_SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, x5); +#ifdef __ELPA_USE_FMA__ + y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + q2 = _SIMD_LOAD(&q_dbl[offset]); + q3 = _SIMD_LOAD(&q_dbl[2*offset]); + q4 = _SIMD_LOAD(&q_dbl[3*offset]); + q5 = _SIMD_LOAD(&q_dbl[4*offset]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); + q5 = _SIMD_ADD(q5, x5); +#endif + + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); + q3 = _SIMD_ADD(q3, y3); + q4 = _SIMD_ADD(q4, y4); + q5 = _SIMD_ADD(q5, y5); +#endif + _SIMD_STORE(&q_dbl[0], q1); + _SIMD_STORE(&q_dbl[offset], q2); + _SIMD_STORE(&q_dbl[2*offset], q3); + _SIMD_STORE(&q_dbl[3*offset], q4); + _SIMD_STORE(&q_dbl[4*offset], q5); + + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = 
_SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]); + + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); + q5 = _SIMD_ADD(q5, x5); + + tmp1 = _SIMD_MUL(h2_imag, y1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, y5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2); + _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3); + _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4); + _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5); + +#endif /* BLOCK2 */ + + + for (i = BLOCK; i < nb; i++) + 
{ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), 
_SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else 
+ q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h2_imag, y5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3); + _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4); + _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5); + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, 
_SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + tmp5 = _SIMD_MUL(h1_imag, x5); +#ifdef __ELPA_USE_FMA__ + q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#else + q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5); + +#endif /* BLOCK2 */ + +} + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + DATA_TYPE_REAL_PTR q_dbl = 
(DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1, x2, x3, x4; + __SIMD_DATATYPE q1, q2, q3, q4; +#ifdef BLOCK2 + __SIMD_DATATYPE y1, y2, y3, y4; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); + x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]); + x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = 
_mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + y2 = _SIMD_LOAD(&q_dbl[offset]); + y3 = _SIMD_LOAD(&q_dbl[2*offset]); + y4 = _SIMD_LOAD(&q_dbl[3*offset]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = _SIMD_LOAD(&q_dbl[0]); + x2 = _SIMD_LOAD(&q_dbl[offset]); + x3 = _SIMD_LOAD(&q_dbl[2*offset]); + x4 = _SIMD_LOAD(&q_dbl[3*offset]); +#endif + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + 
h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), 
_SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, q4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif +#endif /* BLOCK2 */ + } + +#ifdef BLOCK2 + +#if VEC_SET == 
SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h1_imag, q4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), 
_SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef 
__ELPA_USE_FMA__ + x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == 128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI + +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) 
sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON +#if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX) + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif +#endif +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif /* VEC_SET != AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, 
_SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == AVX_256 */ + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp2 = _SIMD_MUL(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + + tmp4 = _SIMD_MUL(h1_imag, y4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#else + y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, 
_SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h2_imag, x4); +#ifdef __ELPA_USE_FMA__ + y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + q2 = _SIMD_LOAD(&q_dbl[offset]); + q3 = _SIMD_LOAD(&q_dbl[2*offset]); + q4 = _SIMD_LOAD(&q_dbl[3*offset]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); +#endif + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); + q3 = _SIMD_ADD(q3, y3); + q4 = _SIMD_ADD(q4, y4); +#endif + + _SIMD_STORE(&q_dbl[0], q1); + _SIMD_STORE(&q_dbl[offset], q2); + _SIMD_STORE(&q_dbl[2*offset], q3); + _SIMD_STORE(&q_dbl[3*offset], q4); + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif 
/* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]); + + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + q4 = _SIMD_ADD(q4, x4); + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2); + _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3); + _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4); + +#endif /* BLOCK2 */ + + for (i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif 
+#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif 
+#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h2_imag, y4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3); + _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4); + + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); 
+#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + tmp4 = _SIMD_MUL(h1_imag, x4); +#ifdef __ELPA_USE_FMA__ + q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#else + q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4); + +#endif /* BLOCK2 */ +} + + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 3 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 
+#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1, x2, x3; + __SIMD_DATATYPE q1, q2, q3; +#ifdef BLOCK2 + __SIMD_DATATYPE y1, y2, y3; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2, tmp3; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = 
(__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); + x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + y2 = _SIMD_LOAD(&q_dbl[offset]); + y3 = _SIMD_LOAD(&q_dbl[2*offset]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = 
_SIMD_LOAD(&q_dbl[0]); + x2 = _SIMD_LOAD(&q_dbl[offset]); + x3 = _SIMD_LOAD(&q_dbl[2*offset]); +#endif + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + 
h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, q3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif +#endif /* BLOCK2 */ + } + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif 
/* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, q3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + 
+#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef 
SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == 128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI + +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif /* VEC_SET 
!= AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif +#endif /* VEC_SET == AVX_512 */ + + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == AVX_256 */ + + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp2 = _SIMD_MUL(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else 
+ y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp3 = _SIMD_MUL(h1_imag, y3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#else + y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, x3); +#ifdef __ELPA_USE_FMA__ + y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + q2 = _SIMD_LOAD(&q_dbl[offset]); + q3 = _SIMD_LOAD(&q_dbl[2*offset]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); +#endif + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); + q3 = _SIMD_ADD(q3, y3); +#endif + + _SIMD_STORE(&q_dbl[0], q1); + _SIMD_STORE(&q_dbl[offset], q2); + _SIMD_STORE(&q_dbl[2*offset], q3); + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if 
VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]); + + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + q3 = _SIMD_ADD(q3, x3); + + tmp1 = _SIMD_MUL(h2_imag, y1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2); + _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3); + +#endif /* BLOCK2 */ + + for (i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = 
_SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); 
+#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h2_imag, y3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3); + + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( 
_SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + tmp3 = _SIMD_MUL(h1_imag, x3); +#ifdef __ELPA_USE_FMA__ + q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#else + q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3); + +#endif /* BLOCK2 */ +} + + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + + DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1, x2; + __SIMD_DATATYPE q1, q2; +#ifdef BLOCK2 + __SIMD_DATATYPE y1, y2; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef 
DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + y2 = 
_SIMD_LOAD(&q_dbl[offset]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = _SIMD_LOAD(&q_dbl[0]); + x2 = _SIMD_LOAD(&q_dbl[offset]); +#endif + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + tmp1 = _SIMD_MUL(h1_imag, q1); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, 
_SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, q2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + } + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif 
/* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, q2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) 
_SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == 128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag 
= _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif /* VEC_SET != AVX_512 */ + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = 
_SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif + +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == AVX_256 */ + + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + tmp2 = _SIMD_MUL(h1_imag, y2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#else + y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, x2); +#ifdef __ELPA_USE_FMA__ + y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, 
tmp2, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + q2 = _SIMD_LOAD(&q_dbl[offset]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); +#endif + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); +#endif + _SIMD_STORE(&q_dbl[0], q1); + _SIMD_STORE(&q_dbl[offset], q2); + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]); + + q1 = _SIMD_ADD(q1, x1); + q2 = _SIMD_ADD(q2, x2); + + tmp1 = _SIMD_MUL(h2_imag, y1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2); + +#endif /* BLOCK2 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = 
_mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]); + tmp1 = _SIMD_MUL(h1_imag, x1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + 
h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h2_imag, y2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2); + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + tmp2 = _SIMD_MUL(h1_imag, x2); +#ifdef __ELPA_USE_FMA__ + q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#else + q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( 
_SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2); + +#endif /* BLOCK2 */ + +} + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 1 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_COMPLEX +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_512 */ + + +static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq +#ifdef BLOCK1 + ) +#endif +#ifdef BLOCK2 + ,int ldh, DATA_TYPE s) +#endif +{ + + DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q; + DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh; +#ifdef BLOCK2 + DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s); +#endif + + __SIMD_DATATYPE x1; + __SIMD_DATATYPE q1; +#ifdef BLOCK2 + __SIMD_DATATYPE y1; + __SIMD_DATATYPE h2_real, h2_imag; +#endif + __SIMD_DATATYPE h1_real, h1_imag; + __SIMD_DATATYPE tmp1, tmp2; + int i=0; + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = 
(__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + +#ifdef BLOCK2 + x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]); + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + y1 = _SIMD_LOAD(&q_dbl[0]); + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#ifdef BLOCK1 + x1 = _SIMD_LOAD(&q_dbl[0]); +#endif + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = 
_mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + + tmp1 = _SIMD_MUL(h2_imag, q1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), 
_SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + } + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + +#ifndef __ELPA_USE_FMA__ + // conjugate + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, q1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[0]); + h1_imag = _mm_loaddup_pd(&hh_dbl[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[0]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[0]); + h1_imag = _SIMD_SET1(hh_dbl[1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) 
h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); +#endif /* VEC_SET != AVX_512 */ + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) ))); +#endif +#endif /* VEC_SET == 128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); + h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[ldh*2]); + h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + h2_real = 
_SIMD_SET1(hh_dbl[ldh*2]); + h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]); + +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign); + h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign); +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign); + h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign); +#endif +#endif +#ifdef HAVE_AVX512_XEON + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + +#endif /* VEC_SET == AVX_512 */ + +#if VEC_SET != AVX_512 + h1_real = _SIMD_XOR(h1_real, sign); + h1_imag = _SIMD_XOR(h1_imag, sign); + h2_real = _SIMD_XOR(h2_real, sign); + h2_imag = _SIMD_XOR(h2_imag, sign); +#endif + +#if VEC_SET == SSE_128 +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl)); +#else + tmp2 = _SIMD_LOADU(s_dbl); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_COMPLEX + tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + s_dbl[1], s_dbl[0], + 
s_dbl[1], s_dbl[0]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0])); +#endif + +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, tmp2); +#ifdef __ELPA_USE_FMA__ + tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + +#if VEC_SET == AVX_512 + _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2); + + h2_real = _SIMD_SET1(s_dbl[0]); + h2_imag = _SIMD_SET1(s_dbl[1]); +#endif + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_movedup_pd(tmp2); + h2_imag = _mm_set1_pd(tmp2[1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(tmp2); + h2_imag = _mm_movehdup_ps(tmp2); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_SET1(tmp2[0]); + h2_imag = _SIMD_SET1(tmp2[1]); +#endif /* VEC_SET == AVX_256 */ + + tmp1 = _SIMD_MUL(h1_imag, y1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#else + y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)); +#endif + + tmp1 = _SIMD_MUL(h2_imag, x1); +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#endif /* BLOCK2 */ + + q1 = _SIMD_LOAD(&q_dbl[0]); + +#ifdef BLOCK1 + q1 = _SIMD_ADD(q1, x1); +#endif + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif + _SIMD_STORE(&q_dbl[0], q1); + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) ))); + h2_imag = 
_mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]); + + q1 = _SIMD_ADD(q1, x1); + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(ldq*2)+0], q1); + +#endif /* BLOCK2 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef 
DOUBLE_PRECISION_COMPLEX + h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); + h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) ))); + h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]); + h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + tmp1 = _SIMD_MUL(h2_imag, y1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif +#endif /* BLOCK2 */ + + _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1); + } +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 +#ifdef DOUBLE_PRECISION_COMPLEX + h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); + h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) ))); + h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) ))); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == AVX_256 + h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 + h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]); + h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]); +#endif /* VEC_SET == AVX_512 */ + + q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]); + + tmp1 = _SIMD_MUL(h1_imag, x1); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#else + q1 = 
_SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE))); +#endif + + _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1); + +#endif /* BLOCK2 */ + +} diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx512_1hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx512_1hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx512_1hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx512_1hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK1 +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx512_1hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx512_1hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx512_1hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx512_1hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK1 +#undef SINGLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx512_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx512_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx512_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx512_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx512_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx512_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx512_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx512_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef SINGLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK1 +#undef VEC_SET +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK1 +#undef VEC_SET +#undef SINGLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define VEC_SET AVX_256 +#define BLOCK2 1 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define VEC_SET AVX_256 +#define BLOCK2 1 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef SINGLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex.F90 elpa-2019.11.001/src/elpa2/kernels/complex.F90 --- elpa-2016.05.001/src/elpa2/kernels/complex.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,89 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! It should be compiled with the highest possible optimization level. +! +! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#ifndef USE_ASSUMED_SIZE +module complex_generic_kernel + + private + public single_hh_trafo_complex_generic_double +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public single_hh_trafo_complex_generic_single +#endif + + contains +#endif + +#define DOUBLE_PRECISION_COMPLEX 1 +#define COMPLEX_DATATYPE ck8 +#include "complex_template.F90" +#undef DOUBLE_PRECISION_COMPLEX +#undef COMPLEX_DATATYPE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#undef DOUBLE_PRECISION_COMPLEX +#define COMPLEX_DATATYPE ck4 +#include "complex_template.F90" +#undef DOUBLE_PRECISION_COMPLEX +#undef COMPLEX_DATATYPE +#endif + +#ifndef USE_ASSUMED_SIZE +end module complex_generic_kernel +#endif + +! 
-------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_simple.F90 elpa-2019.11.001/src/elpa2/kernels/complex_simple.F90 --- elpa-2016.05.001/src/elpa2/kernels/complex_simple.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_simple.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,92 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! 
may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#ifndef USE_ASSUMED_SIZE +module complex_generic_simple_kernel + + private + public single_hh_trafo_complex_generic_simple_double +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public single_hh_trafo_complex_generic_simple_single +#endif + + contains +#endif + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_template.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif + +#ifndef USE_ASSUMED_SIZE +end module complex_generic_simple_kernel +#endif +! 
-------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_sse_1hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_sse_1hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_sse_1hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_sse_1hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK1 +#undef VEC_SET +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_sse_1hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_sse_1hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_sse_1hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_sse_1hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK1 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK1 +#undef VEC_SET +#undef COMPLEXCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_sse_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_sse_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_sse_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_sse_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define VEC_SET SSE_128 +#define BLOCK2 1 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_sse_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/complex_sse_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/complex_sse_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_sse_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#define VEC_SET SSE_128 +#define BLOCK2 1 +#include "../../general/precision_macros.h" +#include "complex_128bit_256bit_512bit_BLOCK_template.c" +#undef VEC_SET +#undef BLOCK2 +#undef SINGLE_PRECISION +#undef COMPLEXCASE + diff -Nru elpa-2016.05.001/src/elpa2/kernels/complex_template.F90 elpa-2019.11.001/src/elpa2/kernels/complex_template.F90 --- elpa-2016.05.001/src/elpa2/kernels/complex_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/complex_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1107 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. 
If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! It should be compiled with the highest possible optimization level. +! +! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- +#endif + + ! the Intel compiler creates a temp array copy of array q! + ! this should be prevented, if possible without using assumed size arrays +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine single_hh_trafo_complex_generic_double(q, hh, nb, nq, ldq) +#else + subroutine single_hh_trafo_complex_generic_single(q, hh, nb, nq, ldq) +#endif + use precision + use elpa_abstract_impl + implicit none + + ! class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nb, nq, ldq +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(1:ldq,1:nb) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:nb) +#endif + + integer(kind=ik) :: i + +!#ifdef DOUBLE_PRECISION_COMPLEX +! +! 
call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_double") +! +!#else +! +! call obj%timer%start("kernel generic: single_hh_trafo_complex_generic_single") +! +!#endif + ! Safety only: + + if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' + + ! Do the Householder transformations + + ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller + + do i=1,nq-8,12 +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_12_double(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_12_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_12_single(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_12_single(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#endif + enddo + + ! i > nq-8 now, i.e. at most 8 rows remain + + if(nq-i+1 > 4) then +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_8_double(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_8_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_8_single(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_8_single(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#endif + else if(nq-i+1 > 0) then +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_4_double(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_4_double(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_4_single(q(i,1),hh, nb, ldq) +#else + call hh_trafo_complex_kernel_4_single(q(i:ldq,1:nb),hh(1:nb), nb, ldq) +#endif + +#endif + endif + +!#ifdef DOUBLE_PRECISION_COMPLEX +! +! call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_double") +! +!#else +! +! call obj%timer%stop("kernel generic: single_hh_trafo_complex_generic_single") +! 
+!#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine single_hh_trafo_complex_generic_double +#else + end subroutine single_hh_trafo_complex_generic_single +#endif + ! -------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine double_hh_trafo_complex_generic_double(q, hh, nb, nq, ldq, ldh) +#else + subroutine double_hh_trafo_complex_generic_single(q, hh, nb, nq, ldq, ldh) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(ldh,*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(1:ldq,1:nb+1) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:ldh,1:2) +#endif + complex(kind=COMPLEX_DATATYPE) :: s + + integer(kind=ik) :: i + + ! Safety only: +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: double_hh_trafo_complex_generic_double") + +#else + + ! call obj%timer%start("kernel generic: double_hh_trafo_complex_generic_single") + +#endif + if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' + + ! Calculate dot product of the two Householder vectors + + s = conjg(hh(2,2)*1) + do i=3,nb + s = s+(conjg(hh(i,2))*hh(i-1,1)) + enddo + + ! Do the Householder transformations + + ! 
Always a multiple of 4 Q-rows is transformed, even if nq is smaller + + do i=1,nq,4 +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_4_2hv_double(q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_complex_kernel_4_2hv_double(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_complex_kernel_4_2hv_single(q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_complex_kernel_4_2hv_single(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#endif + enddo + + !do i=1,nq-8,12 +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#endif + !enddo + + ! i > nq-8 now, i.e. at most 8 rows remain + + !if(nq-i+1 > 4) then +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s) +#else + ! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + +#endif + !else if(nq-i+1 > 0) then +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#else + +#endif + +#else + +#ifdef USE_ASSUMED_SIZE + ! call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#else + +#endif + +#endif + !endif + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! 
call obj%timer%stop("kernel generic: double_hh_trafo_complex_generic_double") + +#else + + ! call obj%timer%stop("kernel generic: double_hh_trafo_complex_generic_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine double_hh_trafo_complex_generic_double +#else + end subroutine double_hh_trafo_complex_generic_single +#endif + ! -------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_12_double(q, hh, nb, ldq) +#else + subroutine hh_trafo_complex_kernel_12_single(q, hh, nb, ldq) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:nb) +#endif + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc + complex(kind=COMPLEX_DATATYPE) :: h1, tau1 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_12_double") + +#else + + ! 
call obj%timer%start("kernel generic: hh_trafo_complex_kernel_12_single") + +#endif + x1 = q(1,1) + x2 = q(2,1) + x3 = q(3,1) + x4 = q(4,1) + x5 = q(5,1) + x6 = q(6,1) + x7 = q(7,1) + x8 = q(8,1) + x9 = q(9,1) + xa = q(10,1) + xb = q(11,1) + xc = q(12,1) + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = conjg(hh(i)) + x1 = x1 + q(1,i)*h1 + x2 = x2 + q(2,i)*h1 + x3 = x3 + q(3,i)*h1 + x4 = x4 + q(4,i)*h1 + x5 = x5 + q(5,i)*h1 + x6 = x6 + q(6,i)*h1 + x7 = x7 + q(7,i)*h1 + x8 = x8 + q(8,i)*h1 + x9 = x9 + q(9,i)*h1 + xa = xa + q(10,i)*h1 + xb = xb + q(11,i)*h1 + xc = xc + q(12,i)*h1 + enddo + + tau1 = hh(1) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + x5 = x5*h1 + x6 = x6*h1 + x7 = x7*h1 + x8 = x8*h1 + x9 = x9*h1 + xa = xa*h1 + xb = xb*h1 + xc = xc*h1 + + q(1,1) = q(1,1) + x1 + q(2,1) = q(2,1) + x2 + q(3,1) = q(3,1) + x3 + q(4,1) = q(4,1) + x4 + q(5,1) = q(5,1) + x5 + q(6,1) = q(6,1) + x6 + q(7,1) = q(7,1) + x7 + q(8,1) = q(8,1) + x8 + q(9,1) = q(9,1) + x9 + q(10,1) = q(10,1) + xa + q(11,1) = q(11,1) + xb + q(12,1) = q(12,1) + xc + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = hh(i) + q(1,i) = q(1,i) + x1*h1 + q(2,i) = q(2,i) + x2*h1 + q(3,i) = q(3,i) + x3*h1 + q(4,i) = q(4,i) + x4*h1 + q(5,i) = q(5,i) + x5*h1 + q(6,i) = q(6,i) + x6*h1 + q(7,i) = q(7,i) + x7*h1 + q(8,i) = q(8,i) + x8*h1 + q(9,i) = q(9,i) + x9*h1 + q(10,i) = q(10,i) + xa*h1 + q(11,i) = q(11,i) + xb*h1 + q(12,i) = q(12,i) + xc*h1 + enddo + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_12_double") + +#else + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_12_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_12_double +#else + end subroutine hh_trafo_complex_kernel_12_single +#endif + ! 
-------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_8_double(q, hh, nb, ldq) +#else + subroutine hh_trafo_complex_kernel_8_single(q, hh, nb, ldq) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:nb) +#endif + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6, x7, x8 + complex(kind=COMPLEX_DATATYPE) :: h1, tau1 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_8_double") + +#else + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_8_single") + +#endif + x1 = q(1,1) + x2 = q(2,1) + x3 = q(3,1) + x4 = q(4,1) + x5 = q(5,1) + x6 = q(6,1) + x7 = q(7,1) + x8 = q(8,1) + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = conjg(hh(i)) + x1 = x1 + q(1,i)*h1 + x2 = x2 + q(2,i)*h1 + x3 = x3 + q(3,i)*h1 + x4 = x4 + q(4,i)*h1 + x5 = x5 + q(5,i)*h1 + x6 = x6 + q(6,i)*h1 + x7 = x7 + q(7,i)*h1 + x8 = x8 + q(8,i)*h1 + enddo + + tau1 = hh(1) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + x5 = x5*h1 + x6 = x6*h1 + x7 = x7*h1 + x8 = x8*h1 + + q(1,1) = q(1,1) + x1 + q(2,1) = q(2,1) + x2 + q(3,1) = q(3,1) + x3 + q(4,1) = q(4,1) + x4 + q(5,1) = q(5,1) + x5 + q(6,1) = q(6,1) + x6 + q(7,1) = q(7,1) + x7 + q(8,1) = q(8,1) + x8 + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = hh(i) + q(1,i) = q(1,i) + x1*h1 + q(2,i) = q(2,i) + x2*h1 + q(3,i) = q(3,i) + x3*h1 + q(4,i) = q(4,i) + x4*h1 + q(5,i) = q(5,i) + x5*h1 + q(6,i) = q(6,i) + x6*h1 + q(7,i) = q(7,i) + x7*h1 + q(8,i) = q(8,i) + x8*h1 + enddo + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! 
call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_8_double") + +#else + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_8_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_8_double +#else + end subroutine hh_trafo_complex_kernel_8_single +#endif + ! -------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_4_double(q, hh, nb, ldq) +#else + subroutine hh_trafo_complex_kernel_4_single(q, hh, nb, ldq) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:nb) +#endif + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4 + complex(kind=COMPLEX_DATATYPE) :: h1, tau1 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_4_double") + +#else + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_4_single") + +#endif + + x1 = q(1,1) + x2 = q(2,1) + x3 = q(3,1) + x4 = q(4,1) + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = conjg(hh(i)) + x1 = x1 + q(1,i)*h1 + x2 = x2 + q(2,i)*h1 + x3 = x3 + q(3,i)*h1 + x4 = x4 + q(4,i)*h1 + enddo + + tau1 = hh(1) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + + q(1,1) = q(1,1) + x1 + q(2,1) = q(2,1) + x2 + q(3,1) = q(3,1) + x3 + q(4,1) = q(4,1) + x4 + + !DEC$ VECTOR ALIGNED + do i=2,nb + h1 = hh(i) + q(1,i) = q(1,i) + x1*h1 + q(2,i) = q(2,i) + x2*h1 + q(3,i) = q(3,i) + x3*h1 + q(4,i) = q(4,i) + x4*h1 + enddo + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_4_double") + +#else + + ! 
call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_4_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_4_double +#else + end subroutine hh_trafo_complex_kernel_4_single +#endif + ! -------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_4_2hv_double(q, hh, nb, ldq, ldh, s) +#else + subroutine hh_trafo_complex_kernel_4_2hv_single(q, hh, nb, ldq, ldh, s) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(ldh,*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:ldh,1:2) +#endif + complex(kind=COMPLEX_DATATYPE), intent(in) :: s + + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4, y1, y2, y3, y4 + complex(kind=COMPLEX_DATATYPE) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv_double") + +#else + + ! 
call obj%timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv_single") + +#endif + x1 = q(1,2) + x2 = q(2,2) + x3 = q(3,2) + x4 = q(4,2) + + y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) + y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) + y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) + y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = conjg(hh(i-1,1)) + h2 = conjg(hh(i,2)) + x1 = x1 + q(1,i)*h1 + y1 = y1 + q(1,i)*h2 + x2 = x2 + q(2,i)*h1 + y2 = y2 + q(2,i)*h2 + x3 = x3 + q(3,i)*h1 + y3 = y3 + q(3,i)*h2 + x4 = x4 + q(4,i)*h1 + y4 = y4 + q(4,i)*h2 + enddo + + x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) + x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) + x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) + x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + h1 = -tau2 + h2 = -tau2*s + y1 = y1*h1 + x1*h2 + y2 = y2*h1 + x2*h2 + y3 = y3*h1 + x3*h2 + y4 = y4*h1 + x4*h2 + + q(1,1) = q(1,1) + y1 + q(2,1) = q(2,1) + y2 + q(3,1) = q(3,1) + y3 + q(4,1) = q(4,1) + y4 + + q(1,2) = q(1,2) + x1 + y1*hh(2,2) + q(2,2) = q(2,2) + x2 + y2*hh(2,2) + q(3,2) = q(3,2) + x3 + y3*hh(2,2) + q(4,2) = q(4,2) + x4 + y4*hh(2,2) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1,i) = q(1,i) + x1*h1 + y1*h2 + q(2,i) = q(2,i) + x2*h1 + y2*h2 + q(3,i) = q(3,i) + x3*h1 + y3*h2 + q(4,i) = q(4,i) + x4*h1 + y4*h2 + enddo + + q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) + q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) + q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv_double") + +#else + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_4_2hv_double +#else + end subroutine hh_trafo_complex_kernel_4_2hv_single +#endif + + ! 
-------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_8_2hv_double(q, hh, nb, ldq, ldh, s) +#else + subroutine hh_trafo_complex_kernel_8_2hv_single(q, hh, nb, ldq, ldh, s) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(ldh,*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:ldh,1:2) +#endif + complex(kind=COMPLEX_DATATYPE), intent(in) :: s + + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8 + complex(kind=COMPLEX_DATATYPE) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv_double") + +#else + + ! 
call obj%timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv_single") + +#endif + + x1 = q(1,2) + x2 = q(2,2) + x3 = q(3,2) + x4 = q(4,2) + x5 = q(5,2) + x6 = q(6,2) + x7 = q(7,2) + x8 = q(8,2) + + y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) + y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) + y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) + y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) + y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) + y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) + y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) + y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = conjg(hh(i-1,1)) + h2 = conjg(hh(i,2)) + x1 = x1 + q(1,i)*h1 + y1 = y1 + q(1,i)*h2 + x2 = x2 + q(2,i)*h1 + y2 = y2 + q(2,i)*h2 + x3 = x3 + q(3,i)*h1 + y3 = y3 + q(3,i)*h2 + x4 = x4 + q(4,i)*h1 + y4 = y4 + q(4,i)*h2 + x5 = x5 + q(5,i)*h1 + y5 = y5 + q(5,i)*h2 + x6 = x6 + q(6,i)*h1 + y6 = y6 + q(6,i)*h2 + x7 = x7 + q(7,i)*h1 + y7 = y7 + q(7,i)*h2 + x8 = x8 + q(8,i)*h1 + y8 = y8 + q(8,i)*h2 + enddo + + x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) + x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) + x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) + x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) + x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) + x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) + x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) + x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + x5 = x5*h1 + x6 = x6*h1 + x7 = x7*h1 + x8 = x8*h1 + + h1 = -tau2 + h2 = -tau2*s + y1 = y1*h1 + x1*h2 + y2 = y2*h1 + x2*h2 + y3 = y3*h1 + x3*h2 + y4 = y4*h1 + x4*h2 + y5 = y5*h1 + x5*h2 + y6 = y6*h1 + x6*h2 + y7 = y7*h1 + x7*h2 + y8 = y8*h1 + x8*h2 + + q(1,1) = q(1,1) + y1 + q(2,1) = q(2,1) + y2 + q(3,1) = q(3,1) + y3 + q(4,1) = q(4,1) + y4 + q(5,1) = q(5,1) + y5 + q(6,1) = q(6,1) + y6 + q(7,1) = q(7,1) + y7 + q(8,1) = q(8,1) + y8 + + q(1,2) = q(1,2) + x1 + y1*hh(2,2) + q(2,2) = q(2,2) + x2 + y2*hh(2,2) + q(3,2) = q(3,2) + x3 + y3*hh(2,2) + q(4,2) = q(4,2) + x4 + y4*hh(2,2) + q(5,2) = q(5,2) + x5 + y5*hh(2,2) + q(6,2) = q(6,2) + x6 
+ y6*hh(2,2) + q(7,2) = q(7,2) + x7 + y7*hh(2,2) + q(8,2) = q(8,2) + x8 + y8*hh(2,2) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1,i) = q(1,i) + x1*h1 + y1*h2 + q(2,i) = q(2,i) + x2*h1 + y2*h2 + q(3,i) = q(3,i) + x3*h1 + y3*h2 + q(4,i) = q(4,i) + x4*h1 + y4*h2 + q(5,i) = q(5,i) + x5*h1 + y5*h2 + q(6,i) = q(6,i) + x6*h1 + y6*h2 + q(7,i) = q(7,i) + x7*h1 + y7*h2 + q(8,i) = q(8,i) + x8*h1 + y8*h2 + enddo + + q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) + q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) + q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) + q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) + q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) + q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) + q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_8_2hv_double") + +#else + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_8_2hv_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_8_2hv_double +#else + end subroutine hh_trafo_complex_kernel_8_2hv_single +#endif + ! 
-------------------------------------------------------------------------------------------------- + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_trafo_complex_kernel_12_2hv_double(q, hh, nb, ldq, ldh, s) +#else + subroutine hh_trafo_complex_kernel_12_2hv_single(q, hh, nb, ldq, ldh, s) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(ldh,*) +#else + complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(:,:) + complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:ldh,1:2) +#endif + complex(kind=COMPLEX_DATATYPE), intent(in) :: s + + complex(kind=COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, & + y7, y8, y9, y10, y11, y12 + complex(kind=COMPLEX_DATATYPE) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%start("kernel generic: hh_trafo_complex_kernel_12_2hv_double") + +#else + + ! 
call obj%timer%start("kernel generic: hh_trafo_complex_kernel_12_2hv_single") + +#endif + x1 = q(1,2) + x2 = q(2,2) + x3 = q(3,2) + x4 = q(4,2) + x5 = q(5,2) + x6 = q(6,2) + x7 = q(7,2) + x8 = q(8,2) + x9 = q(9,2) + x10 = q(10,2) + x11 = q(11,2) + x12 = q(12,2) + + y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) + y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) + y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) + y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) + y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) + y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) + y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) + y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) + y9 = q(9,1) + q(9,2)*conjg(hh(2,2)) + y10 = q(10,1) + q(10,2)*conjg(hh(2,2)) + y11 = q(11,1) + q(11,2)*conjg(hh(2,2)) + y12 = q(12,1) + q(12,2)*conjg(hh(2,2)) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = conjg(hh(i-1,1)) + h2 = conjg(hh(i,2)) + x1 = x1 + q(1,i)*h1 + y1 = y1 + q(1,i)*h2 + x2 = x2 + q(2,i)*h1 + y2 = y2 + q(2,i)*h2 + x3 = x3 + q(3,i)*h1 + y3 = y3 + q(3,i)*h2 + x4 = x4 + q(4,i)*h1 + y4 = y4 + q(4,i)*h2 + x5 = x5 + q(5,i)*h1 + y5 = y5 + q(5,i)*h2 + x6 = x6 + q(6,i)*h1 + y6 = y6 + q(6,i)*h2 + x7 = x7 + q(7,i)*h1 + y7 = y7 + q(7,i)*h2 + x8 = x8 + q(8,i)*h1 + y8 = y8 + q(8,i)*h2 + x9 = x9 + q(9,i)*h1 + y9 = y9 + q(9,i)*h2 + x10 = x10 + q(10,i)*h1 + y10 = y10 + q(10,i)*h2 + x11 = x11 + q(11,i)*h1 + y11 = y11 + q(11,i)*h2 + x12 = x12 + q(12,i)*h1 + y12 = y12 + q(12,i)*h2 + enddo + + x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) + x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) + x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) + x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) + x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) + x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) + x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) + x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) + x9 = x9 + q(9,nb+1)*conjg(hh(nb,1)) + x10 = x10 + q(10,nb+1)*conjg(hh(nb,1)) + x11 = x11 + q(11,nb+1)*conjg(hh(nb,1)) + x12 = x12 + q(12,nb+1)*conjg(hh(nb,1)) + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 + x5 = x5*h1 + x6 = x6*h1 + x7 = x7*h1 + x8 = x8*h1 + x9 = 
x9*h1 + x10 = x10*h1 + x11 = x11*h1 + x12 = x12*h1 + h1 = -tau2 + h2 = -tau2*s + y1 = y1*h1 + x1*h2 + y2 = y2*h1 + x2*h2 + y3 = y3*h1 + x3*h2 + y4 = y4*h1 + x4*h2 + y5 = y5*h1 + x5*h2 + y6 = y6*h1 + x6*h2 + y7 = y7*h1 + x7*h2 + y8 = y8*h1 + x8*h2 + y9 = y9*h1 + x9*h2 + y10 = y10*h1 + x10*h2 + y11 = y11*h1 + x11*h2 + y12 = y12*h1 + x12*h2 + + q(1,1) = q(1,1) + y1 + q(2,1) = q(2,1) + y2 + q(3,1) = q(3,1) + y3 + q(4,1) = q(4,1) + y4 + q(5,1) = q(5,1) + y5 + q(6,1) = q(6,1) + y6 + q(7,1) = q(7,1) + y7 + q(8,1) = q(8,1) + y8 + q(9,1) = q(9,1) + y9 + q(10,1) = q(10,1) + y10 + q(11,1) = q(11,1) + y11 + q(12,1) = q(12,1) + y12 + + q(1,2) = q(1,2) + x1 + y1*hh(2,2) + q(2,2) = q(2,2) + x2 + y2*hh(2,2) + q(3,2) = q(3,2) + x3 + y3*hh(2,2) + q(4,2) = q(4,2) + x4 + y4*hh(2,2) + q(5,2) = q(5,2) + x5 + y5*hh(2,2) + q(6,2) = q(6,2) + x6 + y6*hh(2,2) + q(7,2) = q(7,2) + x7 + y7*hh(2,2) + q(8,2) = q(8,2) + x8 + y8*hh(2,2) + q(9,2) = q(9,2) + x9 + y9*hh(2,2) + q(10,2) = q(10,2) + x10 + y10*hh(2,2) + q(11,2) = q(11,2) + x11 + y11*hh(2,2) + q(12,2) = q(12,2) + x12 + y12*hh(2,2) + + !DEC$ VECTOR ALIGNED + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1,i) = q(1,i) + x1*h1 + y1*h2 + q(2,i) = q(2,i) + x2*h1 + y2*h2 + q(3,i) = q(3,i) + x3*h1 + y3*h2 + q(4,i) = q(4,i) + x4*h1 + y4*h2 + q(5,i) = q(5,i) + x5*h1 + y5*h2 + q(6,i) = q(6,i) + x6*h1 + y6*h2 + q(7,i) = q(7,i) + x7*h1 + y7*h2 + q(8,i) = q(8,i) + x8*h1 + y8*h2 + q(9,i) = q(9,i) + x9*h1 + y9*h2 + q(10,i) = q(10,i) + x10*h1 + y10*h2 + q(11,i) = q(11,i) + x11*h1 + y11*h2 + q(12,i) = q(12,i) + x12*h1 + y12*h2 + enddo + + q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) + q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) + q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) + q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) + q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) + q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) + q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) + q(9,nb+1) = q(9,nb+1) + x9*hh(nb,1) + q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) + q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) + 
q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) + +#ifdef DOUBLE_PRECISION_COMPLEX + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_12_2hv_double") + +#else + + ! call obj%timer%stop("kernel generic: hh_trafo_complex_kernel_12_2hv_single") + +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_trafo_complex_kernel_12_2hv_double +#else + end subroutine hh_trafo_complex_kernel_12_2hv_single +#endif diff -Nru elpa-2016.05.001/src/elpa2/kernels/mod_single_hh_trafo_real.F90 elpa-2019.11.001/src/elpa2/kernels/mod_single_hh_trafo_real.F90 --- elpa-2016.05.001/src/elpa2/kernels/mod_single_hh_trafo_real.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/mod_single_hh_trafo_real.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,128 @@ +module single_hh_trafo_real + implicit none +#include "config-f90.h" + +#ifdef WITH_OPENMP + public single_hh_trafo_real_cpu_openmp_double +#else + public single_hh_trafo_real_cpu_double +#endif + +#ifdef WANT_SINGLE_PRECISION_REAL + +#ifdef WITH_OPENMP + public single_hh_trafo_real_cpu_openmp_single +#else + public single_hh_trafo_real_cpu_single +#endif + +#endif + + contains + +#ifdef WITH_OPENMP + subroutine single_hh_trafo_real_cpu_openmp_double(q, hh, nb, nq, ldq) +#else + subroutine single_hh_trafo_real_cpu_double(q, hh, nb, nq, ldq) +#endif + + use elpa_abstract_impl + use precision + ! Perform single real Householder transformation. + ! This routine is not performance critical and thus it is coded here in Fortran + + implicit none + ! class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik), intent(in) :: nb, nq, ldq +! real(kind=rk8), intent(inout) :: q(ldq, *) +! real(kind=rk8), intent(in) :: hh(*) + real(kind=rk8), intent(inout) :: q(1:ldq, 1:nb) + real(kind=rk8), intent(in) :: hh(1:nb) + integer(kind=ik) :: i + real(kind=rk8) :: v(nq) + +!#ifdef WITH_OPENMP +! call obj%timer%start("single_hh_trafo_real_cpu_openmp_double") +!#else +! 
call obj%timer%start("single_hh_trafo_real_cpu_double") +!#endif + + ! v = q * hh + v(:) = q(1:nq,1) + do i=2,nb + v(:) = v(:) + q(1:nq,i) * hh(i) + enddo + + ! v = v * tau + v(:) = v(:) * hh(1) + + ! q = q - v * hh**T + q(1:nq,1) = q(1:nq,1) - v(:) + do i=2,nb + q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) + enddo + +!#ifdef WITH_OPENMP +! call obj%timer%stop("single_hh_trafo_real_cpu_openmp_double") +!#else +! call obj%timer%stop("single_hh_trafo_real_cpu_double") +!#endif + end subroutine + +#ifdef WANT_SINGLE_PRECISION_REAL +! single precision implementation at the moment duplicated !!! + +#ifdef WITH_OPENMP + subroutine single_hh_trafo_real_cpu_openmp_single(q, hh, nb, nq, ldq) +#else + subroutine single_hh_trafo_real_cpu_single(q, hh, nb, nq, ldq) +#endif + + use elpa_abstract_impl + use precision + ! Perform single real Householder transformation. + ! This routine is not performance critical and thus it is coded here in Fortran + + implicit none + !class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik), intent(in) :: nb, nq, ldq +! real(kind=rk4), intent(inout) :: q(ldq, *) +! real(kind=rk4), intent(in) :: hh(*) + real(kind=rk4), intent(inout) :: q(1:ldq, 1:nb) + real(kind=rk4), intent(in) :: hh(1:nb) + integer(kind=ik) :: i + real(kind=rk4) :: v(nq) + +!#ifdef WITH_OPENMP +! call obj%timer%start("single_hh_trafo_real_cpu_openmp_single") +!#else +! call obj%timer%start("single_hh_trafo_real_cpu_single") +!#endif + + ! v = q * hh + v(:) = q(1:nq,1) + do i=2,nb + v(:) = v(:) + q(1:nq,i) * hh(i) + enddo + + ! v = v * tau + v(:) = v(:) * hh(1) + + ! q = q - v * hh**T + q(1:nq,1) = q(1:nq,1) - v(:) + do i=2,nb + q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) + enddo + +!#ifdef WITH_OPENMP +! call obj%timer%stop("single_hh_trafo_real_cpu_openmp_single") +!#else +! 
call obj%timer%stop("single_hh_trafo_real_cpu_single") +!#endif + end subroutine + + +#endif /* WANT_SINGLE_PRECISION_REAL */ +end module diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c elpa-2019.11.001/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c --- elpa-2016.05.001/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c 2019-12-20 05:57:47.000000000 +0000 @@ -0,0 +1,17367 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke +// +#include "config-f90.h" + +#define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h) +#define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h + +#define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g) +#define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g + +#define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f) +#define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f + +#define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e) +#define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e + +#define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d) +#define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d + +#define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c) +#define CONCAT2_3ARGS(a, b, c) a ## b ## c + +//define instruction set numbers +#define SSE_128 128 +#define SPARC64_SSE 1281 +#define VSX_SSE 1282 +#define NEON_ARCH64_128 1285 +#define AVX_256 256 +#define AVX_512 512 + + +#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 +#include +#endif + +#if VEC_SET == SPARC64_SSE +#include +#include +#endif + +#if VEC_SET == VSX_SSE +#include +#endif + +#if VEC_SET == NEON_ARCH64_128 +#include +#endif + +#include +#include + +#ifdef BLOCK6 +#define PREFIX hexa +#define BLOCK 6 +#endif + +#ifdef BLOCK4 +#define PREFIX quad 
+#define BLOCK 4 +#endif + +#ifdef BLOCK2 +#define PREFIX double +#define BLOCK 2 +#endif + +#if VEC_SET == SSE_128 +#define SIMD_SET SSE +#endif + +#if VEC_SET == SPARC64_SSE +#define SIMD_SET SPARC64 +#endif + +#if VEC_SET == VSX_SSE +#define SIMD_SET VSX +#endif + +#if VEC_SET == NEON_ARCH64_128 +#define SIMD_SET NEON_ARCH64 +#endif + +#if VEC_SET == AVX_256 +#define SIMD_SET AVX_AVX2 +#endif + +#if VEC_SET == AVX_512 +#define SIMD_SET AVX512 +#endif + +#define __forceinline __attribute__((always_inline)) static + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE +#ifdef DOUBLE_PRECISION_REAL +#define offset 2 +#define __SIMD_DATATYPE __m128d +#define _SIMD_LOAD _mm_load_pd +#define _SIMD_STORE _mm_store_pd +#define _SIMD_ADD _mm_add_pd +#define _SIMD_MUL _mm_mul_pd +#define _SIMD_SUB _mm_sub_pd +#define _SIMD_XOR _mm_xor_pd +#if VEC_SET == SSE_128 +#define _SIMD_SET _mm_set_pd +#define _SIMD_SET1 _mm_set1_pd +#define _SIMD_NEG 1 +#endif +#if VEC_SET == SPARC64_SSE +#define _SIMD_NEG _fjsp_neg_v2r8 +#endif +#endif /* DOUBLE_PRECISION_REAL */ +#ifdef SINGLE_PRECISION_REAL +#define offset 4 +#define __SIMD_DATATYPE __m128 +#define _SIMD_LOAD _mm_load_ps +#define _SIMD_STORE _mm_store_ps +#define _SIMD_ADD _mm_add_ps +#define _SIMD_MUL _mm_mul_ps +#define _SIMD_SUB _mm_sub_ps +#define _SIMD_XOR _mm_xor_ps +#if VEC_SET == SSE_128 +#define _SIMD_SET _mm_set_ps +#define _SIMD_SET1 _mm_set1_ps +#define _SIMD_NEG 1 +#endif +#if VEC_SET == SPARC64_SSE +#define _SIMD_NEG 1 +#endif +#endif /* SINGLE_PRECISION_REAL */ +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE */ + +#if VEC_SET == VSX_SSE + +#ifdef DOUBLE_PRECISION_REAL +#define offset 2 +#define __SIMD_DATATYPE __vector double +#define __SIMD_LOAD (__vector double) vec_ld +#endif + +#ifdef SINGLE_PRECISION_REAL +#define offset 4 +#define __SIMD_DATATYPE __vector float +#define _SIMD_LOAD (__vector float) vec_ld +#endif + +#define _SIMD_NEG 1 +#define _SIMD_STORE vec_st +#define _SIMD_ADD vec_add +#define 
_SIMD_MUL vec_mul +#define _SIMD_SET1 vec_splats + +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == NEON_ARCH64_128 +#define __ELPA_USE_FMA__ +#ifdef DOUBLE_PRECISION_REAL +#define offset 2 +#define __SIMD_DATATYPE __Float64x2_t +#define _SIMD_LOAD vld1q_f64 +#define _SIMD_STORE vst1q_f64 +#define _SIMD_ADD vaddq_f64 +#define _SIMD_MUL vmulq_f64 +#define _SIMD_SUB vsubq_f64 +#define _SIMD_NEG vnegq_f64 +#define _SIMD_FMA(a, b, c) vfmaq_f64(c ,b, a) +#define _SIMD_NFMA(a, b, c) vnegq_f64(vfmaq_f64(c ,b, a)) +#define _SIMD_FMSUB(a, b, c) vfmsq_f64(c, b, a) +//#define _SIMD_XOR _mm_xor_pd +#define _SIMD_SET1 vdupq_n_f64 +#endif /* DOUBLE_PRECISION_REAL */ +#ifdef SINGLE_PRECISION_REAL +#define offset 4 +#define __SIMD_DATATYPE __Float32x4_t +#define _SIMD_LOAD vld1q_f32 +#define _SIMD_STORE vst1q_f32 +#define _SIMD_ADD vaddq_f32 +#define _SIMD_MUL vmulq_f32 +#define _SIMD_SUB vsubq_f32 +#define _SIMD_NEG vnegq_f32 +#define _SIMD_FMA(a, b, c) vfmaq_f32(c ,b, a) +#define _SIMD_NFMA(a, b, c) vnegq_f32(vfmaq_f32(c ,b, a)) +#define _SIMD_FMSUB(a, b, c) vfmsq_f32(c, b, a) +//#define _SIMD_XOR _mm_xor_ps +#define _SIMD_SET1 vdupq_n_f32 +#endif /* SINGLE_PRECISION_REAL */ +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define offset 4 +#define __SIMD_DATATYPE __m256d +#define _SIMD_LOAD _mm256_load_pd +#define _SIMD_STORE _mm256_store_pd +#define _SIMD_ADD _mm256_add_pd +#define _SIMD_MUL _mm256_mul_pd +#define _SIMD_SUB _mm256_sub_pd +#define _SIMD_SET1 _mm256_set1_pd +#define _SIMD_XOR _mm256_xor_pd +#define _SIMD_BROADCAST _mm256_broadcast_sd +#define _SIMD_NEG 1 +#ifdef HAVE_AVX2 +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) +#error "This should be prop _mm256_msub_pd instead of _mm256_msub" +#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) +#endif /* __FMA4__ */ +#ifdef __AVX2__ +#define 
__ELPA_USE_FMA__ +#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) +#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) +#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) +#endif /* __AVX2__ */ +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMA _mm256_FMA_pd +#define _SIMD_NFMA _mm256_NFMA_pd +#define _SIMD_FMSUB _mm256_FMSUB_pd +#endif +#endif /* HAVE_AVX2 */ +#endif /* DOUBLE_PRECISION_REAL */ + +#ifdef SINGLE_PRECISION_REAL +#define offset 8 +#define __SIMD_DATATYPE __m256 +#define _SIMD_LOAD _mm256_load_ps +#define _SIMD_STORE _mm256_store_ps +#define _SIMD_ADD _mm256_add_ps +#define _SIMD_MUL _mm256_mul_ps +#define _SIMD_SUB _mm256_sub_ps +#define _SIMD_SET1 _mm256_set1_ps +#define _SIMD_XOR _mm256_xor_ps +#define _SIMD_BROADCAST _mm256_broadcast_ss +#define _SIMD_NEG 1 +#ifdef HAVE_AVX2 +#ifdef __FMA4__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_ps(a,b,c) _mm256_macc_ps(a,b,c) +#define _mm256_NFMA_ps(a,b,c) _mm256_nmacc_ps(a,b,c) +#error "This should be prop _mm256_msub_ps instead of _mm256_msub" +#define _mm256_FMSUB_ps(a,b,c) _mm256_msub(a,b,c) +#endif /* __FMA4__ */ +#ifdef __AVX2__ +#define __ELPA_USE_FMA__ +#define _mm256_FMA_ps(a,b,c) _mm256_fmadd_ps(a,b,c) +#define _mm256_NFMA_ps(a,b,c) _mm256_fnmadd_ps(a,b,c) +#define _mm256_FMSUB_ps(a,b,c) _mm256_fmsub_ps(a,b,c) +#endif /* __AVX2__ */ +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMA _mm256_FMA_ps +#define _SIMD_NFMA _mm256_NFMA_ps +#define _SIMD_FMSUB _mm256_FMSUB_ps +#endif +#endif /* HAVE_AVX2 */ +#endif /* SINGLE_PRECISION_REAL */ +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define offset 8 +#define __SIMD_DATATYPE __m512d +#define __SIMD_INTEGER __m512i +#define _SIMD_LOAD _mm512_load_pd +#define _SIMD_STORE _mm512_store_pd +#define _SIMD_ADD _mm512_add_pd +#define _SIMD_MUL _mm512_mul_pd +#define _SIMD_SUB _mm512_sub_pd +#define _SIMD_SET1 _mm512_set1_pd +#define _SIMD_NEG 1 +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_pd +#endif +#ifdef 
HAVE_AVX512 +#define __ELPA_USE_FMA__ +#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c) +#define _mm512_NFMA_pd(a,b,c) _mm512_fnmadd_pd(a,b,c) +#define _mm512_FMSUB_pd(a,b,c) _mm512_fmsub_pd(a,b,c) +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMA _mm512_FMA_pd +#define _SIMD_NFMA _mm512_NFMA_pd +#define _SIMD_FMSUB _mm512_FMSUB_pd +#endif +#endif /* HAVE_AVX512 */ +#endif /* DOUBLE_PRECISION_REAL */ + +#ifdef SINGLE_PRECISION_REAL +#define offset 16 +#define __SIMD_DATATYPE __m512 +#define __SIMD_INTEGER __m512i +#define _SIMD_LOAD _mm512_load_ps +#define _SIMD_STORE _mm512_store_ps +#define _SIMD_ADD _mm512_add_ps +#define _SIMD_MUL _mm512_mul_ps +#define _SIMD_SUB _mm512_sub_ps +#define _SIMD_SET1 _mm512_set1_ps +#define _SIMD_NEG 1 +#ifdef HAVE_AVX512_XEON +#define _SIMD_XOR _mm512_xor_ps +#endif +#ifdef HAVE_AVX512 +#define __ELPA_USE_FMA__ +#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c) +#define _mm512_NFMA_ps(a,b,c) _mm512_fnmadd_ps(a,b,c) +#define _mm512_FMSUB_ps(a,b,c) _mm512_fmsub_ps(a,b,c) +#ifdef __ELPA_USE_FMA__ +#define _SIMD_FMA _mm512_FMA_ps +#define _SIMD_NFMA _mm512_NFMA_ps +#define _SIMD_FMSUB _mm512_FMSUB_ps +#endif +#endif /* HAVE_AVX512 */ +#endif /* SINGLE_PRECISION_REAL */ +#endif /* VEC_SET == AVX_512 */ + +#ifdef DOUBLE_PRECISION_REAL +#define WORD_LENGTH double +#define DATA_TYPE double +#define DATA_TYPE_PTR double* +#endif +#ifdef SINGLE_PRECISION_REAL +#define WORD_LENGTH single +#define DATA_TYPE float +#define DATA_TYPE_PTR float* +#endif + +#if VEC_SET == SSE_128 +#undef __AVX__ +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 +#undef _LOAD +#undef _STORE +#undef _XOR +#define _LOAD(x) _SIMD_LOAD(x) +#define _STORE(a, b) _SIMD_STORE(a, b) +#define _XOR(a, b) _SIMD_XOR(a, b) +#endif + +#if VEC_SET == VSX_SSE +#undef _LOAD +#undef _STORE +#undef _XOR +#define _LOAD(x) _SIMD_LOAD(0, (unsigned long int *) x) +#define _STORE(a, b) _SIMD_STORE((__vector 
unsigned int) b, 0, (unsigned int *) a) +#define _XOR(a, b) vec_mul(b, a) +#endif + + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +//Forward declaration +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#ifdef 
SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || 
VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 32 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 64 +#endif +#endif /* VEC_SET == AVX_512 */ + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 40 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 80 +#endif +#endif /* VEC_SET == AVX_512 */ + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE 
s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 48 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 96 +#endif +#endif /* VEC_SET == AVX_512 */ + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s); +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4); +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods); +#endif + +void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq, int* pldh); + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine double_hh_trafo_real_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_SSE_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine double_hh_trafo_real_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, 
name="double_hh_trafo_real_SSE_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_SPARC64_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_SPARC64_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_SPARC64_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_SPARC64_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_NEON_ARCH64_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_NEON_ARCH64_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine double_hh_trafo_real_NEON_ARCH64_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_NEON_ARCH64_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#ifdef HAVE_VSX_SSE +!f> interface +!f> subroutine double_hh_trafo_real_VSX_2hv_double(q, hh, pnb, 
pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_VSX_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_VSX_SSE +!f> interface +!f> subroutine double_hh_trafo_real_VSX_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_VSX_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine double_hh_trafo_real_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_AVX_AVX2_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine double_hh_trafo_real_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_AVX_AVX2_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine double_hh_trafo_real_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_AVX512_2hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine 
double_hh_trafo_real_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="double_hh_trafo_real_AVX512_2hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine quad_hh_trafo_real_SSE_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_SSE_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine quad_hh_trafo_real_SSE_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_SSE_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_SPARC64_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_SPARC64_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_SPARC64_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_SPARC64_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine 
quad_hh_trafo_real_NEON_ARCH64_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_NEON_ARCH64_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_NEON_ARCH64_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_NEON_ARCH64_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_VSX_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_VSX_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_VSX_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_VSX_SSE +!f> interface +!f> subroutine quad_hh_trafo_real_VSX_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_VSX_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine quad_hh_trafo_real_AVX_AVX2_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_AVX_AVX2_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#if defined(HAVE_AVX) || 
defined(HAVE_AVX2) +!f> interface +!f> subroutine quad_hh_trafo_real_AVX_AVX2_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_AVX_AVX2_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine quad_hh_trafo_real_AVX512_4hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_AVX512_4hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine quad_hh_trafo_real_AVX512_4hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="quad_hh_trafo_real_AVX512_4hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine hexa_hh_trafo_real_SSE_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_SPARC64_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef 
HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_NEON_ARCH64_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_NEON_ARCH64_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#ifdef HAVE_SSE_INTRINSICS +!f> interface +!f> subroutine hexa_hh_trafo_real_SSE_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_SSE_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_SPARC64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_SPARC64_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_SPARC64_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + +/* +!f>#ifdef HAVE_NEON_ARCH64_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_NEON_ARCH64_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_NEON_ARCH64_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef HAVE_VSX_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_VSX_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_VSX_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#ifdef 
HAVE_VSX_SSE +!f> interface +!f> subroutine hexa_hh_trafo_real_VSX_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_VSX_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ + + +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine hexa_hh_trafo_real_AVX_AVX2_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_AVX_AVX2_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2) +!f> interface +!f> subroutine hexa_hh_trafo_real_AVX_AVX2_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_AVX_AVX2_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine hexa_hh_trafo_real_AVX512_6hv_double(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_double") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_double) :: hh(pnb,6) +!f> end subroutine +!f> end interface +!f>#endif +*/ +/* +!f>#if defined(HAVE_AVX512) +!f> interface +!f> subroutine hexa_hh_trafo_real_AVX512_6hv_single(q, hh, pnb, pnq, pldq, pldh) & +!f> bind(C, name="hexa_hh_trafo_real_AVX512_6hv_single") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh +!f> type(c_ptr), value :: q +!f> real(kind=c_float) :: hh(pnb,6) +!f> end subroutine +!f> end interface 
+!f>#endif +*/ + +void CONCAT_7ARGS(PREFIX,_hh_trafo_real_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq, int* pldh) +{ + int i; + int nb = *pnb; + int nq = *pldq; + int ldq = *pldq; + int ldh = *pldh; + int worked_on; + + worked_on = 0; + +#ifdef BLOCK2 + // calculating scalar product to compute + // 2 householder vectors simultaneously + DATA_TYPE s = hh[(ldh)+1]*1.0; +#endif + +#ifdef BLOCK4 + // calculating scalar products to compute + // 4 householder vectors simultaneously + DATA_TYPE s_1_2 = hh[(ldh)+1]; + DATA_TYPE s_1_3 = hh[(ldh*2)+2]; + DATA_TYPE s_2_3 = hh[(ldh*2)+1]; + DATA_TYPE s_1_4 = hh[(ldh*3)+3]; + DATA_TYPE s_2_4 = hh[(ldh*3)+2]; + DATA_TYPE s_3_4 = hh[(ldh*3)+1]; + + // calculate scalar product of first and fourth householder Vector + // loop counter = 2 + s_1_2 += hh[2-1] * hh[(2+ldh)]; + s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; + s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; + + // loop counter = 3 + s_1_2 += hh[3-1] * hh[(3+ldh)]; + s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; + s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; + + s_1_3 += hh[3-2] * hh[3+(ldh*2)]; + s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + // calculating scalar products to compute + // 6 householder vectors simultaneously + DATA_TYPE scalarprods[15]; + + scalarprods[0] = hh[(ldh+1)]; + scalarprods[1] = hh[(ldh*2)+2]; + scalarprods[2] = hh[(ldh*2)+1]; + scalarprods[3] = hh[(ldh*3)+3]; + scalarprods[4] = hh[(ldh*3)+2]; + scalarprods[5] = hh[(ldh*3)+1]; + scalarprods[6] = hh[(ldh*4)+4]; + scalarprods[7] = hh[(ldh*4)+3]; + scalarprods[8] = hh[(ldh*4)+2]; + scalarprods[9] = hh[(ldh*4)+1]; + scalarprods[10] = hh[(ldh*5)+5]; + scalarprods[11] = hh[(ldh*5)+4]; + scalarprods[12] = hh[(ldh*5)+3]; + scalarprods[13] = hh[(ldh*5)+2]; + scalarprods[14] = hh[(ldh*5)+1]; + + // calculate scalar product of first and fourth householder Vector + // loop counter = 2 + scalarprods[0] += hh[1] * hh[(2+ldh)]; + 
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; + + // loop counter = 3 + scalarprods[0] += hh[2] * hh[(3+ldh)]; + scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; + + scalarprods[1] += hh[1] * hh[3+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; + + // loop counter = 4 + scalarprods[0] += hh[3] * hh[(4+ldh)]; + scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; + + scalarprods[1] += hh[2] * hh[4+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; + + scalarprods[3] += hh[1] * hh[4+(ldh*3)]; + scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; + + // loop counter = 5 + scalarprods[0] += hh[4] * hh[(5+ldh)]; + scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; + + scalarprods[1] += hh[3] * hh[5+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; + + scalarprods[3] += hh[2] * hh[5+(ldh*3)]; + scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; + + scalarprods[6] += hh[1] * hh[5+(ldh*4)]; + scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; + + 
+#endif /* BLOCK6 */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 + #pragma ivdep +#endif + for (i = BLOCK; i < nb; i++) + { +#ifdef BLOCK2 + s += hh[i-1] * hh[(i+ldh)]; +#endif + +#ifdef BLOCK4 + s_1_2 += hh[i-1] * hh[(i+ldh)]; + s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + + s_1_3 += hh[i-2] * hh[i+(ldh*2)]; + s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + + s_1_4 += hh[i-3] * hh[i+(ldh*3)]; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + scalarprods[0] += hh[i-1] * hh[(i+ldh)]; + scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; + scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; + scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; + scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; + + scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; + scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; + scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; + scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; + + scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; + scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; + scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; + + scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; + scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; + + scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; +#endif /* BLOCK6 */ + + } + + // Production level kernel calls with padding +#ifdef BLOCK2 + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define STEP_SIZE 12 +#define ROW_LENGTH 12 +#define UPPER_BOUND 10 +#endif +#ifdef SINGLE_PRECISION_REAL +#define STEP_SIZE 24 +#define ROW_LENGTH 24 +#define UPPER_BOUND 20 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define STEP_SIZE 24 +#define ROW_LENGTH 24 +#define UPPER_BOUND 20 +#endif +#ifdef SINGLE_PRECISION_REAL +#define STEP_SIZE 48 +#define ROW_LENGTH 48 +#define 
UPPER_BOUND 40 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define STEP_SIZE 32 +#define ROW_LENGTH 32 +#define UPPER_BOUND 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define STEP_SIZE 64 +#define ROW_LENGTH 64 +#define UPPER_BOUND 48 +#endif +#endif /* VEC_SET == AVX_512 */ + + + for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + + if (nq == i) + { + return; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == 
AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + + + if (nq-i == ROW_LENGTH) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + if (nq-i == ROW_LENGTH) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256 + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + + + if (nq-i == ROW_LENGTH) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on 
+= ROW_LENGTH; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + + if (nq-i == ROW_LENGTH) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_2hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s); + worked_on += ROW_LENGTH; + } + +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 || VEC_SET == AVX_256 */ + +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 6 +#define STEP_SIZE 6 +#define UPPER_BOUND 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 12 +#define STEP_SIZE 12 +#define UPPER_BOUND 8 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 12 +#define STEP_SIZE 12 +#define UPPER_BOUND 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 24 +#define STEP_SIZE 24 +#define UPPER_BOUND 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 32 +#define STEP_SIZE 32 +#define UPPER_BOUND 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 64 +#define STEP_SIZE 64 +#define UPPER_BOUND 48 +#endif +#endif /* VEC_SET == AVX_512 */ + for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, 
s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + worked_on += ROW_LENGTH; + } + + if (nq == i) + { + return; + } + + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ + + + if (nq-i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + worked_on += ROW_LENGTH; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == AVX_512 + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef 
SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + + if (nq-i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_4hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); + worked_on += ROW_LENGTH; + } + +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#define STEP_SIZE 4 +#define UPPER_BOUND 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#define STEP_SIZE 8 +#define UPPER_BOUND 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#define STEP_SIZE 8 +#define UPPER_BOUND 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#define STEP_SIZE 16 +#define UPPER_BOUND 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 32 +#define STEP_SIZE 32 +#define UPPER_BOUND 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 64 +#define STEP_SIZE 64 +#define UPPER_BOUND 48 +#endif +#endif /* VEC_SET == AVX_512 */ + + for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); + worked_on += ROW_LENGTH; + } + if (nq == i) + { + return; + } + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif 
+#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_512 */ + + + if (nq -i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); + worked_on += ROW_LENGTH; + } +#if VEC_SET == AVX_512 + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ + + + if (nq -i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); + worked_on += ROW_LENGTH; + } + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + + + if (nq -i == ROW_LENGTH ) + { + CONCAT_6ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_6hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, scalarprods); + worked_on += ROW_LENGTH; + } +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK6 */ + +#ifdef WITH_DEBUG + if (worked_on != nq) + { + printf("Error in real SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq); + abort(); + } +#endif +} + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif +#ifdef 
SINGLE_PRECISION_REAL +#define ROW_LENGTH 96 +#endif +#endif /* VEC_SET == AVX_512 */ + +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#ifdef BLOCK4 +/* + * vectors + a rank 1 update is performed + */ +#endif +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [10 x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// +#endif +#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [10 x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* 
VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE x4 = _LOAD(&q[ldq+3*offset]); + __SIMD_DATATYPE x5 = _LOAD(&q[ldq+4*offset]); + __SIMD_DATATYPE x6 = _LOAD(&q[ldq+5*offset]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + __SIMD_DATATYPE h2; +#ifdef __ELPA_USE_FMA__ + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_FMA(x2, h1, q2); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_FMA(x3, h1, q3); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_FMA(x4, h1, q4); + __SIMD_DATATYPE q5 = _LOAD(&q[4*offset]); + __SIMD_DATATYPE y5 = _SIMD_FMA(x5, h1, q5); + __SIMD_DATATYPE q6 = _LOAD(&q[5*offset]); + __SIMD_DATATYPE y6 = _SIMD_FMA(x6, h1, q6); +#else + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); + __SIMD_DATATYPE q5 = _LOAD(&q[4*offset]); + __SIMD_DATATYPE y5 = _SIMD_ADD(q5, _SIMD_MUL(x5, h1)); + 
__SIMD_DATATYPE q6 = _LOAD(&q[5*offset]); + __SIMD_DATATYPE y6 = _SIMD_ADD(q6, _SIMD_MUL(x6, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = 
_SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[0+offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); + register __SIMD_DATATYPE x2 = a1_2; +#else + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); + register __SIMD_DATATYPE x2 = a1_2; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[0+2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); + register __SIMD_DATATYPE x3 = a1_3; +#else + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, 
_SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); + register __SIMD_DATATYPE x3 = a1_3; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[ldq+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[0+3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); + register __SIMD_DATATYPE x4 = a1_4; +#else + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); + register __SIMD_DATATYPE x4 = a1_4; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_5 = _LOAD(&q[(ldq*3)+4*offset]); + __SIMD_DATATYPE a2_5 = _LOAD(&q[(ldq*2)+4*offset]); + __SIMD_DATATYPE a3_5 = _LOAD(&q[ldq+4*offset]); + __SIMD_DATATYPE a4_5 = _LOAD(&q[0+4*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w5 = _SIMD_FMA(a3_5, h_4_3, a4_5); + w5 = _SIMD_FMA(a2_5, h_4_2, w5); + w5 = _SIMD_FMA(a1_5, h_4_1, w5); + register __SIMD_DATATYPE z5 = _SIMD_FMA(a2_5, h_3_2, a3_5); + z5 = _SIMD_FMA(a1_5, h_3_1, z5); + register __SIMD_DATATYPE y5 = _SIMD_FMA(a1_5, h_2_1, a2_5); + register __SIMD_DATATYPE x5 = a1_5; +#else + register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3)); + w5 = _SIMD_ADD(w5, 
_SIMD_MUL(a2_5, h_4_2)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1)); + register __SIMD_DATATYPE z5 = _SIMD_ADD(a3_5, _SIMD_MUL(a2_5, h_3_2)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1)); + register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1)); + register __SIMD_DATATYPE x5 = a1_5; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_6 = _LOAD(&q[(ldq*3)+5*offset]); + __SIMD_DATATYPE a2_6 = _LOAD(&q[(ldq*2)+5*offset]); + __SIMD_DATATYPE a3_6 = _LOAD(&q[ldq+5*offset]); + __SIMD_DATATYPE a4_6 = _LOAD(&q[0+5*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w6 = _SIMD_FMA(a3_6, h_4_3, a4_6); + w6 = _SIMD_FMA(a2_6, h_4_2, w6); + w6 = _SIMD_FMA(a1_6, h_4_1, w6); + register __SIMD_DATATYPE z6 = _SIMD_FMA(a2_6, h_3_2, a3_6); + z6 = _SIMD_FMA(a1_6, h_3_1, z6); + register __SIMD_DATATYPE y6 = _SIMD_FMA(a1_6, h_2_1, a2_6); + register __SIMD_DATATYPE x6 = a1_6; +#else + register __SIMD_DATATYPE w6 = _SIMD_ADD(a4_6, _SIMD_MUL(a3_6, h_4_3)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(a2_6, h_4_2)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(a1_6, h_4_1)); + register __SIMD_DATATYPE z6 = _SIMD_ADD(a3_6, _SIMD_MUL(a2_6, h_3_2)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(a1_6, h_3_1)); + register __SIMD_DATATYPE y6 = _SIMD_ADD(a2_6, _SIMD_MUL(a1_6, h_2_1)); + register __SIMD_DATATYPE x6 = a1_6; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + __SIMD_DATATYPE q5; + __SIMD_DATATYPE q6; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + 
__SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 = _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + 
__SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = 
_SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*5)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*4)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a5_2 = _LOAD(&q[(ldq)+offset]); + __SIMD_DATATYPE a6_2 = _LOAD(&q[offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t2 = _SIMD_FMA(a5_2, h_6_5, a6_2); + t2 = _SIMD_FMA(a4_2, h_6_4, t2); + t2 = _SIMD_FMA(a3_2, h_6_3, t2); + t2 = _SIMD_FMA(a2_2, h_6_2, t2); + t2 = _SIMD_FMA(a1_2, h_6_1, t2); + register __SIMD_DATATYPE v2 = _SIMD_FMA(a4_2, h_5_4, a5_2); + v2 = _SIMD_FMA(a3_2, h_5_3, v2); + v2 = _SIMD_FMA(a2_2, h_5_2, v2); + v2 = _SIMD_FMA(a1_2, h_5_1, v2); + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = 
_SIMD_FMA(a1_2, h_2_1, a2_2); +#else + register __SIMD_DATATYPE t2 = _SIMD_ADD(a6_2, _SIMD_MUL(a5_2, h_6_5)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a4_2, h_6_4)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a3_2, h_6_3)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a2_2, h_6_2)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a1_2, h_6_1)); + register __SIMD_DATATYPE v2 = _SIMD_ADD(a5_2, _SIMD_MUL(a4_2, h_5_4)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a3_2, h_5_3)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a2_2, h_5_2)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a1_2, h_5_1)); + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x2 = a1_2; + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*5)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*4)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a5_3 = _LOAD(&q[(ldq)+2*offset]); + __SIMD_DATATYPE a6_3 = _LOAD(&q[2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t3 = _SIMD_FMA(a5_3, h_6_5, a6_3); + t3 = _SIMD_FMA(a4_3, h_6_4, t3); + t3 = _SIMD_FMA(a3_3, h_6_3, t3); + t3 = _SIMD_FMA(a2_3, h_6_2, t3); + t3 = _SIMD_FMA(a1_3, h_6_1, t3); + register __SIMD_DATATYPE v3 = _SIMD_FMA(a4_3, h_5_4, a5_3); + v3 = _SIMD_FMA(a3_3, h_5_3, v3); + v3 = _SIMD_FMA(a2_3, h_5_2, v3); + v3 = _SIMD_FMA(a1_3, h_5_1, v3); + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); +#else + register __SIMD_DATATYPE t3 = _SIMD_ADD(a6_3, 
_SIMD_MUL(a5_3, h_6_5)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a4_3, h_6_4)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a3_3, h_6_3)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a2_3, h_6_2)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a1_3, h_6_1)); + register __SIMD_DATATYPE v3 = _SIMD_ADD(a5_3, _SIMD_MUL(a4_3, h_5_4)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a3_3, h_5_3)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a2_3, h_5_2)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a1_3, h_5_1)); + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x3 = a1_3; + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*5)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*4)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a5_4 = _LOAD(&q[(ldq)+3*offset]); + __SIMD_DATATYPE a6_4 = _LOAD(&q[3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t4 = _SIMD_FMA(a5_4, h_6_5, a6_4); + t4 = _SIMD_FMA(a4_4, h_6_4, t4); + t4 = _SIMD_FMA(a3_4, h_6_3, t4); + t4 = _SIMD_FMA(a2_4, h_6_2, t4); + t4 = _SIMD_FMA(a1_4, h_6_1, t4); + register __SIMD_DATATYPE v4 = _SIMD_FMA(a4_4, h_5_4, a5_4); + v4 = _SIMD_FMA(a3_4, h_5_3, v4); + v4 = _SIMD_FMA(a2_4, h_5_2, v4); + v4 = _SIMD_FMA(a1_4, h_5_1, v4); + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); +#else + register __SIMD_DATATYPE t4 = _SIMD_ADD(a6_4, _SIMD_MUL(a5_4, h_6_5)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a4_4, h_6_4)); + t4 = _SIMD_ADD(t4, 
_SIMD_MUL(a3_4, h_6_3)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a2_4, h_6_2)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a1_4, h_6_1)); + register __SIMD_DATATYPE v4 = _SIMD_ADD(a5_4, _SIMD_MUL(a4_4, h_5_4)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a3_4, h_5_3)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a2_4, h_5_2)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a1_4, h_5_1)); + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x4 = a1_4; + + __SIMD_DATATYPE a1_5 = _LOAD(&q[(ldq*5)+4*offset]); + __SIMD_DATATYPE a2_5 = _LOAD(&q[(ldq*4)+4*offset]); + __SIMD_DATATYPE a3_5 = _LOAD(&q[(ldq*3)+4*offset]); + __SIMD_DATATYPE a4_5 = _LOAD(&q[(ldq*2)+4*offset]); + __SIMD_DATATYPE a5_5 = _LOAD(&q[(ldq)+4*offset]); + __SIMD_DATATYPE a6_5 = _LOAD(&q[4*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t5 = _SIMD_FMA(a5_5, h_6_5, a6_5); + t5 = _SIMD_FMA(a4_5, h_6_4, t5); + t5 = _SIMD_FMA(a3_5, h_6_3, t5); + t5 = _SIMD_FMA(a2_5, h_6_2, t5); + t5 = _SIMD_FMA(a1_5, h_6_1, t5); + register __SIMD_DATATYPE v5 = _SIMD_FMA(a4_5, h_5_4, a5_5); + v5 = _SIMD_FMA(a3_5, h_5_3, v5); + v5 = _SIMD_FMA(a2_5, h_5_2, v5); + v5 = _SIMD_FMA(a1_5, h_5_1, v5); + register __SIMD_DATATYPE w5 = _SIMD_FMA(a3_5, h_4_3, a4_5); + w5 = _SIMD_FMA(a2_5, h_4_2, w5); + w5 = _SIMD_FMA(a1_5, h_4_1, w5); + register __SIMD_DATATYPE z5 = _SIMD_FMA(a2_5, h_3_2, a3_5); + z5 = _SIMD_FMA(a1_5, h_3_1, z5); + register __SIMD_DATATYPE y5 = _SIMD_FMA(a1_5, h_2_1, a2_5); +#else + register __SIMD_DATATYPE t5 = _SIMD_ADD(a6_5, _SIMD_MUL(a5_5, h_6_5)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a4_5, h_6_4)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a3_5, h_6_3)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a2_5, h_6_2)); + t5 = _SIMD_ADD(t5, 
_SIMD_MUL(a1_5, h_6_1)); + register __SIMD_DATATYPE v5 = _SIMD_ADD(a5_5, _SIMD_MUL(a4_5, h_5_4)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(a3_5, h_5_3)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(a2_5, h_5_2)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(a1_5, h_5_1)); + register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a2_5, h_4_2)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1)); + register __SIMD_DATATYPE z5 = _SIMD_ADD(a3_5, _SIMD_MUL(a2_5, h_3_2)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1)); + register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x5 = a1_5; + + __SIMD_DATATYPE a1_6 = _LOAD(&q[(ldq*5)+5*offset]); + __SIMD_DATATYPE a2_6 = _LOAD(&q[(ldq*4)+5*offset]); + __SIMD_DATATYPE a3_6 = _LOAD(&q[(ldq*3)+5*offset]); + __SIMD_DATATYPE a4_6 = _LOAD(&q[(ldq*2)+5*offset]); + __SIMD_DATATYPE a5_6 = _LOAD(&q[(ldq)+5*offset]); + __SIMD_DATATYPE a6_6 = _LOAD(&q[5*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t6 = _SIMD_FMA(a5_6, h_6_5, a6_6); + t6 = _SIMD_FMA(a4_6, h_6_4, t6); + t6 = _SIMD_FMA(a3_6, h_6_3, t6); + t6 = _SIMD_FMA(a2_6, h_6_2, t6); + t6 = _SIMD_FMA(a1_6, h_6_1, t6); + register __SIMD_DATATYPE v6 = _SIMD_FMA(a4_6, h_5_4, a5_6); + v6 = _SIMD_FMA(a3_6, h_5_3, v6); + v6 = _SIMD_FMA(a2_6, h_5_2, v6); + v6 = _SIMD_FMA(a1_6, h_5_1, v6); + register __SIMD_DATATYPE w6 = _SIMD_FMA(a3_6, h_4_3, a4_6); + w6 = _SIMD_FMA(a2_6, h_4_2, w6); + w6 = _SIMD_FMA(a1_6, h_4_1, w6); + register __SIMD_DATATYPE z6 = _SIMD_FMA(a2_6, h_3_2, a3_6); + z6 = _SIMD_FMA(a1_6, h_3_1, z6); + register __SIMD_DATATYPE y6 = _SIMD_FMA(a1_6, h_2_1, a2_6); +#else + register __SIMD_DATATYPE t6 = _SIMD_ADD(a6_6, _SIMD_MUL(a5_6, h_6_5)); + t6 = _SIMD_ADD(t6, _SIMD_MUL(a4_6, h_6_4)); + t6 = _SIMD_ADD(t6, _SIMD_MUL(a3_6, h_6_3)); + t6 = _SIMD_ADD(t6, _SIMD_MUL(a2_6, h_6_2)); + t6 = _SIMD_ADD(t6, _SIMD_MUL(a1_6, h_6_1)); + register __SIMD_DATATYPE v6 = _SIMD_ADD(a5_6, _SIMD_MUL(a4_6, 
h_5_4)); + v6 = _SIMD_ADD(v6, _SIMD_MUL(a3_6, h_5_3)); + v6 = _SIMD_ADD(v6, _SIMD_MUL(a2_6, h_5_2)); + v6 = _SIMD_ADD(v6, _SIMD_MUL(a1_6, h_5_1)); + register __SIMD_DATATYPE w6 = _SIMD_ADD(a4_6, _SIMD_MUL(a3_6, h_4_3)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(a2_6, h_4_2)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(a1_6, h_4_1)); + register __SIMD_DATATYPE z6 = _SIMD_ADD(a3_6, _SIMD_MUL(a2_6, h_3_2)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(a1_6, h_3_1)); + register __SIMD_DATATYPE y6 = _SIMD_ADD(a2_6, _SIMD_MUL(a1_6, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x6 = a1_6; + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + __SIMD_DATATYPE q5; + __SIMD_DATATYPE q6; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + + for(i = BLOCK; i < nb; i++) + { + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + q5 = _LOAD(&q[(i*ldq)+4*offset]); + q6 = _LOAD(&q[(i*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); + x2 = _SIMD_FMA(q2, h1, x2); + y2 = _SIMD_FMA(q2, h2, y2); + x3 = _SIMD_FMA(q3, h1, x3); + y3 = _SIMD_FMA(q3, h2, y3); + x4 = _SIMD_FMA(q4, h1, x4); + y4 = _SIMD_FMA(q4, h2, y4); + x5 = _SIMD_FMA(q5, h1, x5); + y5 = _SIMD_FMA(q5, h2, y5); + x6 = _SIMD_FMA(q6, h1, x6); + y6 = _SIMD_FMA(q6, h2, y6); +#else + x1 = 
_SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); + z6 = _SIMD_FMA(q6, h3, z6); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); + w6 = _SIMD_FMA(q6, h4, w6); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, 
_SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); + v5 = _SIMD_FMA(q5, h5, v5); + v6 = _SIMD_FMA(q6, h5, v6); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); + v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif + + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); + t2 = _SIMD_FMA(q2, h6, t2); + t3 = _SIMD_FMA(q3, h6, t3); + t4 = _SIMD_FMA(q4, h6, t4); + t5 = _SIMD_FMA(q5, h6, t5); + t6 = _SIMD_FMA(q6, h6, t6); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(q2,h6)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(q3,h6)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(q4,h6)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(q5,h6)); + t6 = _SIMD_ADD(t6, _SIMD_MUL(q6,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = 
_SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + q5 = _LOAD(&q[(nb*ldq)+4*offset]); + q6 = _LOAD(&q[(nb*ldq)+5*offset]); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); + y6 = _SIMD_FMA(q6, h2, y6); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], 
hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); + z6 = _SIMD_FMA(q6, h3, z6); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); +#endif + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+1)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); 
+ y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); + y6 = _SIMD_FMA(q6, h2, y6); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+2)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); + w6 = _SIMD_FMA(q6, h4, w6); 
+#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); + v5 = _SIMD_FMA(q5, h5, v5); + v6 = _SIMD_FMA(q6, h5, v6); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); + v6 = _SIMD_ADD(v6, _SIMD_MUL(q6,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-4]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-4], hh[nb-4]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+1)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = 
_SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-3], hh[ldh+nb-3]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); + y6 = _SIMD_FMA(q6, h2, y6); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); + z6 = _SIMD_FMA(q6, h3, z6); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = 
_SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); + w6 = _SIMD_FMA(q6, h4, w6); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); + w6 = _SIMD_ADD(w6, _SIMD_MUL(q6,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-3], hh[nb-3]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-3]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+2)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = 
_SIMD_FMA(q5, h2, y5); + y6 = _SIMD_FMA(q6, h2, y6); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); + z6 = _SIMD_FMA(q6, h3, z6); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); + z6 = _SIMD_ADD(z6, _SIMD_MUL(q6,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+3)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+3)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 
= _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); + y6 = _SIMD_FMA(q6, h2, y6); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); + y6 = _SIMD_ADD(y6, _SIMD_MUL(q6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+4)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+4)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); + x6 = _SIMD_FMA(q6, h1, x6); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + x6 = _SIMD_ADD(x6, _SIMD_MUL(q6,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + 
///////////////////////////////////////////////////// + // Rank-2 update of Q [ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + ///////////////////////////////////////////////////// + // Rank-1 update of Q [6 x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + 
__SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + 
__SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = 
_SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + x2 = _SIMD_MUL(x2, h1); + x3 = _SIMD_MUL(x3, h1); + x4 = _SIMD_MUL(x4, h1); + x5 = _SIMD_MUL(x5, h1); + x6 = _SIMD_MUL(x6, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + h2 = _SIMD_MUL(h1, vs); +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMA(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMA(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMA(y4, h1, _SIMD_MUL(x4,h2)); + y5 = _SIMD_FMA(y5, h1, 
_SIMD_MUL(x5,h2)); + y6 = _SIMD_FMA(y6, h1, _SIMD_MUL(x6,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_ADD(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_ADD(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = _SIMD_ADD(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); + y5 = _SIMD_ADD(_SIMD_MUL(y5,h1), _SIMD_MUL(x5,h2)); + y6 = _SIMD_ADD(_SIMD_MUL(y6,h1), _SIMD_MUL(x6,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMSUB(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMSUB(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMSUB(y4, h1, _SIMD_MUL(x4,h2)); + y5 = _SIMD_FMSUB(y5, h1, _SIMD_MUL(x5,h2)); + y6 = _SIMD_FMSUB(y6, h1, _SIMD_MUL(x6,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_SUB(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_SUB(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = _SIMD_SUB(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); + y5 = _SIMD_SUB(_SIMD_MUL(y5,h1), _SIMD_MUL(x5,h2)); + y6 = _SIMD_SUB(_SIMD_MUL(y6,h1), _SIMD_MUL(x6,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); + z2 = _SIMD_FMSUB(z2, h1, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))); + z3 = _SIMD_FMSUB(z3, h1, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))); + z4 = _SIMD_FMSUB(z4, h1, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2))); + z5 = _SIMD_FMSUB(z5, h1, _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2))); + z6 = _SIMD_FMSUB(z6, h1, _SIMD_FMA(y6, h3, _SIMD_MUL(x6,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); + z2 = _SIMD_SUB(_SIMD_MUL(z2,h1), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))); + z3 = _SIMD_SUB(_SIMD_MUL(z3,h1), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))); + z4 = _SIMD_SUB(_SIMD_MUL(z4,h1), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))); + z5 = _SIMD_SUB(_SIMD_MUL(z5,h1), 
_SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2))); + z6 = _SIMD_SUB(_SIMD_MUL(z6,h1), _SIMD_ADD(_SIMD_MUL(y6,h3), _SIMD_MUL(x6,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + w2 = _SIMD_FMSUB(w2, h1, _SIMD_FMA(z2, h4, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + w3 = _SIMD_FMSUB(w3, h1, _SIMD_FMA(z3, h4, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); + w4 = _SIMD_FMSUB(w4, h1, _SIMD_FMA(z4, h4, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); + w5 = _SIMD_FMSUB(w5, h1, _SIMD_FMA(z5, h4, _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2)))); + w6 = _SIMD_FMSUB(w6, h1, _SIMD_FMA(z6, h4, _SIMD_FMA(y6, h3, _SIMD_MUL(x6,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + w2 = _SIMD_SUB(_SIMD_MUL(w2,h1), _SIMD_ADD(_SIMD_MUL(z2,h4), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + w3 = _SIMD_SUB(_SIMD_MUL(w3,h1), _SIMD_ADD(_SIMD_MUL(z3,h4), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); + w4 = _SIMD_SUB(_SIMD_MUL(w4,h1), _SIMD_ADD(_SIMD_MUL(z4,h4), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); + w5 = _SIMD_SUB(_SIMD_MUL(w5,h1), _SIMD_ADD(_SIMD_MUL(z5,h4), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2)))); + w6 = _SIMD_SUB(_SIMD_MUL(w6,h1), _SIMD_ADD(_SIMD_MUL(z6,h4), _SIMD_ADD(_SIMD_MUL(y6,h3), _SIMD_MUL(x6,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + v2 = _SIMD_FMSUB(v2, tau5, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + v3 = _SIMD_FMSUB(v3, tau5, _SIMD_ADD(_SIMD_FMA(w3, h5, 
_SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); + v4 = _SIMD_FMSUB(v4, tau5, _SIMD_ADD(_SIMD_FMA(w4, h5, _SIMD_MUL(z4,h4)), _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); + v5 = _SIMD_FMSUB(v5, tau5, _SIMD_ADD(_SIMD_FMA(w5, h5, _SIMD_MUL(z5,h4)), _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2)))); + v6 = _SIMD_FMSUB(v6, tau5, _SIMD_ADD(_SIMD_FMA(w6, h5, _SIMD_MUL(z6,h4)), _SIMD_FMA(y6, h3, _SIMD_MUL(x6,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + v2 = _SIMD_SUB(_SIMD_MUL(v2,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + v3 = _SIMD_SUB(_SIMD_MUL(v3,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); + v4 = _SIMD_SUB(_SIMD_MUL(v4,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); + v5 = _SIMD_SUB(_SIMD_MUL(v5,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w5,h5), _SIMD_MUL(z5,h4)), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2)))); + v6 = _SIMD_SUB(_SIMD_MUL(v6,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w6,h5), _SIMD_MUL(z6,h4)), _SIMD_ADD(_SIMD_MUL(y6,h3), _SIMD_MUL(x6,h2)))); +#endif /* __ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); + t2 = _SIMD_FMSUB(t2, tau6, _SIMD_FMA(v2, h6, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))))); + t3 = _SIMD_FMSUB(t3, tau6, _SIMD_FMA(v3, h6, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))))); + t4 = _SIMD_FMSUB(t4, tau6, _SIMD_FMA(v4, h6, _SIMD_ADD(_SIMD_FMA(w4, h5, _SIMD_MUL(z4,h4)), _SIMD_FMA(y4, 
h3, _SIMD_MUL(x4,h2))))); + t5 = _SIMD_FMSUB(t5, tau6, _SIMD_FMA(v5, h6, _SIMD_ADD(_SIMD_FMA(w5, h5, _SIMD_MUL(z5,h4)), _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2))))); + t6 = _SIMD_FMSUB(t6, tau6, _SIMD_FMA(v6, h6, _SIMD_ADD(_SIMD_FMA(w6, h5, _SIMD_MUL(z6,h4)), _SIMD_FMA(y6, h3, _SIMD_MUL(x6,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); + t2 = _SIMD_SUB(_SIMD_MUL(t2,tau6), _SIMD_ADD( _SIMD_MUL(v2,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))))); + t3 = _SIMD_SUB(_SIMD_MUL(t3,tau6), _SIMD_ADD( _SIMD_MUL(v3,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))))); + t4 = _SIMD_SUB(_SIMD_MUL(t4,tau6), _SIMD_ADD( _SIMD_MUL(v4,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))))); + t5 = _SIMD_SUB(_SIMD_MUL(t5,tau6), _SIMD_ADD( _SIMD_MUL(v5,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w5,h5), _SIMD_MUL(z5,h4)), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2))))); + t6 = _SIMD_SUB(_SIMD_MUL(t6,tau6), _SIMD_ADD( _SIMD_MUL(v6,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w6,h5), _SIMD_MUL(z6,h4)), _SIMD_ADD(_SIMD_MUL(y6,h3), _SIMD_MUL(x6,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); +#endif + _STORE(&q[0],q1); + q2 = _LOAD(&q[offset]); +#ifdef BLOCK2 + q2 = _SIMD_ADD(q2, y2); +#endif +#ifdef BLOCK4 + q2 = _SIMD_SUB(q2, w2); +#endif +#ifdef BLOCK6 + q2 = _SIMD_SUB(q2, t2); +#endif + _STORE(&q[offset],q2); + q3 = _LOAD(&q[2*offset]); +#ifdef BLOCK2 + q3 = 
_SIMD_ADD(q3, y3); +#endif +#ifdef BLOCK4 + q3 = _SIMD_SUB(q3, w3); +#endif +#ifdef BLOCK6 + q3 = _SIMD_SUB(q3, t3); +#endif + _STORE(&q[2*offset],q3); + q4 = _LOAD(&q[3*offset]); +#ifdef BLOCK2 + q4 = _SIMD_ADD(q4, y4); +#endif +#ifdef BLOCK4 + q4 = _SIMD_SUB(q4, w4); +#endif +#ifdef BLOCK6 + q4 = _SIMD_SUB(q4, t4); +#endif + _STORE(&q[3*offset],q4); + q5 = _LOAD(&q[4*offset]); +#ifdef BLOCK2 + q5 = _SIMD_ADD(q5, y5); +#endif +#ifdef BLOCK4 + q5 = _SIMD_SUB(q5, w5); +#endif +#ifdef BLOCK6 + q5 = _SIMD_SUB(q5, t5); +#endif + _STORE(&q[4*offset],q5); + q6 = _LOAD(&q[5*offset]); +#ifdef BLOCK2 + q6 = _SIMD_ADD(q6, y6); +#endif +#ifdef BLOCK4 + q6 = _SIMD_SUB(q6, w6); +#endif +#ifdef BLOCK6 + q6 = _SIMD_SUB(q6, t6); +#endif + _STORE(&q[5*offset],q6); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + q5 = _LOAD(&q[ldq+4*offset]); + q6 = _LOAD(&q[ldq+5*offset]); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + q2 = _SIMD_ADD(q2, _SIMD_FMA(y2, h2, x2)); + q3 = _SIMD_ADD(q3, _SIMD_FMA(y3, h2, x3)); + q4 = _SIMD_ADD(q4, _SIMD_FMA(y4, h2, x4)); + q5 = _SIMD_ADD(q5, _SIMD_FMA(y5, h2, x5)); + q6 = _SIMD_ADD(q6, _SIMD_FMA(y6, h2, x6)); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(x2, _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(x3, _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(x4, _SIMD_MUL(y4, h2))); + q5 = _SIMD_ADD(q5, _SIMD_ADD(x5, _SIMD_MUL(y5, h2))); + q6 = _SIMD_ADD(q6, _SIMD_ADD(x6, _SIMD_MUL(y6, h2))); +#endif /* __ELPA_USE_FMA__ */ + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + 
_STORE(&q[ldq+3*offset],q4); + _STORE(&q[ldq+4*offset],q5); + _STORE(&q[ldq+5*offset],q6); +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + q5 = _LOAD(&q[ldq+4*offset]); + q6 = _LOAD(&q[ldq+5*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); + q2 = _SIMD_SUB(q2, _SIMD_FMA(w2, h4, z2)); + q3 = _SIMD_SUB(q3, _SIMD_FMA(w3, h4, z3)); + q4 = _SIMD_SUB(q4, _SIMD_FMA(w4, h4, z4)); + q5 = _SIMD_SUB(q5, _SIMD_FMA(w5, h4, z5)); + q6 = _SIMD_SUB(q6, _SIMD_FMA(w6, h4, z6)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); + q2 = _SIMD_SUB(q2, _SIMD_ADD(z2, _SIMD_MUL(w2, h4))); + q3 = _SIMD_SUB(q3, _SIMD_ADD(z3, _SIMD_MUL(w3, h4))); + q4 = _SIMD_SUB(q4, _SIMD_ADD(z4, _SIMD_MUL(w4, h4))); + q5 = _SIMD_SUB(q5, _SIMD_ADD(z5, _SIMD_MUL(w5, h4))); + q6 = _SIMD_SUB(q6, _SIMD_ADD(z6, _SIMD_MUL(w6, h4))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + _STORE(&q[ldq+3*offset],q4); + _STORE(&q[ldq+4*offset],q5); + _STORE(&q[ldq+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + q5 = _LOAD(&q[(ldq*2)+4*offset]); + q6 = _LOAD(&q[(ldq*2)+5*offset]); + q1 = _SIMD_SUB(q1, y1); + q2 = 
_SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + q5 = _SIMD_SUB(q5, y5); + q6 = _SIMD_SUB(q6, y6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + _STORE(&q[(ldq*2)+4*offset],q5); + _STORE(&q[(ldq*2)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = 
_LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + q5 = _LOAD(&q[(ldq*3)+4*offset]); + q6 = _LOAD(&q[(ldq*3)+5*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + q5 = _SIMD_SUB(q5, x5); + q6 = _SIMD_SUB(q6, x6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, 
q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3], q1); + _STORE(&q[(ldq*3)+offset], q2); + _STORE(&q[(ldq*3)+2*offset], q3); + _STORE(&q[(ldq*3)+3*offset], q4); + _STORE(&q[(ldq*3)+4*offset], q5); + _STORE(&q[(ldq*3)+5*offset], q6); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[(ldq+offset)]); + q3 = _LOAD(&q[(ldq+2*offset)]); + q4 = _LOAD(&q[(ldq+3*offset)]); + q5 = _LOAD(&q[(ldq+4*offset)]); + q6 = _LOAD(&q[(ldq+5*offset)]); + q1 = _SIMD_SUB(q1, v1); + q2 = _SIMD_SUB(q2, v2); + q3 = _SIMD_SUB(q3, v3); + q4 = _SIMD_SUB(q4, v4); + q5 = _SIMD_SUB(q5, v5); + q6 = _SIMD_SUB(q6, v6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif + + _STORE(&q[ldq],q1); + _STORE(&q[(ldq+offset)],q2); + _STORE(&q[(ldq+2*offset)],q3); + _STORE(&q[(ldq+3*offset)],q4); + _STORE(&q[(ldq+4*offset)],q5); + _STORE(&q[(ldq+5*offset)],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + q5 = _LOAD(&q[(ldq*2)+4*offset]); + q6 = _LOAD(&q[(ldq*2)+5*offset]); + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + q3 = _SIMD_SUB(q3, w3); + q4 = _SIMD_SUB(q4, w4); + q5 = _SIMD_SUB(q5, w5); + q6 = _SIMD_SUB(q6, w6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + 
_STORE(&q[(ldq*2)+4*offset],q5); + _STORE(&q[(ldq*2)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + q5 = _LOAD(&q[(ldq*3)+4*offset]); + q6 = _LOAD(&q[(ldq*3)+5*offset]); + q1 = _SIMD_SUB(q1, z1); + q2 = _SIMD_SUB(q2, z2); + q3 = _SIMD_SUB(q3, z3); + q4 = _SIMD_SUB(q4, z4); + q5 = _SIMD_SUB(q5, z5); + q6 = _SIMD_SUB(q6, z6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif 
/* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif + + _STORE(&q[ldq*3],q1); + _STORE(&q[(ldq*3)+offset],q2); + _STORE(&q[(ldq*3)+2*offset],q3); + _STORE(&q[(ldq*3)+3*offset],q4); + _STORE(&q[(ldq*3)+4*offset],q5); + _STORE(&q[(ldq*3)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + q1 = _LOAD(&q[ldq*4]); + q2 = _LOAD(&q[(ldq*4)+offset]); + q3 = _LOAD(&q[(ldq*4)+2*offset]); + q4 = _LOAD(&q[(ldq*4)+3*offset]); + q5 = _LOAD(&q[(ldq*4)+4*offset]); + q6 = _LOAD(&q[(ldq*4)+5*offset]); + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + q5 = _SIMD_SUB(q5, y5); + q6 = _SIMD_SUB(q6, y6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 
= _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = 
_SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif + + _STORE(&q[ldq*4],q1); + _STORE(&q[(ldq*4)+offset],q2); + _STORE(&q[(ldq*4)+2*offset],q3); + _STORE(&q[(ldq*4)+3*offset],q4); + _STORE(&q[(ldq*4)+4*offset],q5); + _STORE(&q[(ldq*4)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q2 = _LOAD(&q[(ldq*5)+offset]); + q3 = _LOAD(&q[(ldq*5)+2*offset]); + q4 = _LOAD(&q[(ldq*5)+3*offset]); + q5 = _LOAD(&q[(ldq*5)+4*offset]); + q6 = _LOAD(&q[(ldq*5)+5*offset]); + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + q5 = _SIMD_SUB(q5, x5); + q6 = _SIMD_SUB(q6, x6); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); 
+#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, 
_SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif + + _STORE(&q[ldq*5],q1); + _STORE(&q[(ldq*5)+offset],q2); + _STORE(&q[(ldq*5)+2*offset],q3); + _STORE(&q[(ldq*5)+3*offset],q4); + _STORE(&q[(ldq*5)+4*offset],q5); + _STORE(&q[(ldq*5)+5*offset],q6); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + q5 = _LOAD(&q[(i*ldq)+4*offset]); + q6 = _LOAD(&q[(i*ldq)+5*offset]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ + q1 = 
_SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q2 = _SIMD_FMA(y2, h2, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q3 = _SIMD_FMA(y3, h2, q3); + q4 = _SIMD_FMA(x4, h1, q4); + q4 = _SIMD_FMA(y4, h2, q4); + q5 = _SIMD_FMA(x5, h1, q5); + q5 = _SIMD_FMA(y5, h2, q5); + q6 = _SIMD_FMA(x6, h1, q6); + q6 = _SIMD_FMA(y6, h2, q6); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(_SIMD_MUL(x2,h1), _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(_SIMD_MUL(x3,h1), _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(_SIMD_MUL(x4,h1), _SIMD_MUL(y4, h2))); + q5 = _SIMD_ADD(q5, _SIMD_ADD(_SIMD_MUL(x5,h1), _SIMD_MUL(y5, h2))); + q6 = _SIMD_ADD(q6, _SIMD_ADD(_SIMD_MUL(x6,h1), _SIMD_MUL(y6, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2,h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3,h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4,h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5,h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6,h1)); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3,h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4,h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5,h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET 
== SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3,h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4,h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5,h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2,h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3,h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4,h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5,h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = 
_SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); + q6 = _SIMD_NFMA(t6, h6, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(t6, h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[i*ldq],q1); + _STORE(&q[(i*ldq)+offset],q2); + _STORE(&q[(i*ldq)+2*offset],q3); + _STORE(&q[(i*ldq)+3*offset],q4); + _STORE(&q[(i*ldq)+4*offset],q5); + _STORE(&q[(i*ldq)+5*offset],q6); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + q5 = _LOAD(&q[(nb*ldq)+4*offset]); + q6 = _LOAD(&q[(nb*ldq)+5*offset]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + 
q2 = _SIMD_FMA(x2, h1, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q4 = _SIMD_FMA(x4, h1, q4); + q5 = _SIMD_FMA(x5, h1, q5); + q6 = _SIMD_FMA(x6, h1, q6); +#else + q1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_ADD(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_ADD(q6, _SIMD_MUL(x6, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + 
h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); + q6 = _SIMD_NFMA(v6, h5, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + 
q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(v6, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + _STORE(&q[(nb*ldq)+offset],q2); + _STORE(&q[(nb*ldq)+2*offset],q3); + _STORE(&q[(nb*ldq)+3*offset],q4); + _STORE(&q[(nb*ldq)+4*offset],q5); + _STORE(&q[(nb*ldq)+5*offset],q6); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+1)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + 
q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); + q6 = _SIMD_NFMA(w6, h4, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(w6, h4)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+1)*ldq],q1); + _STORE(&q[((nb+1)*ldq)+offset],q2); + 
_STORE(&q[((nb+1)*ldq)+2*offset],q3); + _STORE(&q[((nb+1)*ldq)+3*offset],q4); + _STORE(&q[((nb+1)*ldq)+4*offset],q5); + _STORE(&q[((nb+1)*ldq)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+2)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if 
VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); + q6 = _SIMD_NFMA(z6, h3, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(z6, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[(nb+2)*ldq],q1); + _STORE(&q[((nb+2)*ldq)+offset],q2); + _STORE(&q[((nb+2)*ldq)+2*offset],q3); + _STORE(&q[((nb+2)*ldq)+3*offset],q4); + _STORE(&q[((nb+2)*ldq)+4*offset],q5); + _STORE(&q[((nb+2)*ldq)+5*offset],q6); + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+3)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+3)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + 
q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); + q6 = _SIMD_NFMA(y6, h2, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(y6, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); + _STORE(&q[((nb+3)*ldq)+offset],q2); + _STORE(&q[((nb+3)*ldq)+2*offset],q3); + _STORE(&q[((nb+3)*ldq)+3*offset],q4); + _STORE(&q[((nb+3)*ldq)+4*offset],q5); + _STORE(&q[((nb+3)*ldq)+5*offset],q6); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+4)*ldq)+4*offset]); + q6 = _LOAD(&q[((nb+4)*ldq)+5*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); + q6 = _SIMD_NFMA(x6, h1, q6); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = 
_SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); + q6 = _SIMD_SUB(q6, _SIMD_MUL(x6, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + _STORE(&q[((nb+4)*ldq)+offset],q2); + _STORE(&q[((nb+4)*ldq)+2*offset],q3); + _STORE(&q[((nb+4)*ldq)+3*offset],q4); + _STORE(&q[((nb+4)*ldq)+4*offset],q5); + _STORE(&q[((nb+4)*ldq)+5*offset],q6); + +#endif /* BLOCK6 */ +} + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 10 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 20 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 20 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 40 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 40 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 80 +#endif +#endif /* VEC_SET == AVX_512 */ + +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#ifdef BLOCK4 +/* + * vectors + a rank 1 update is performed + */ +#endif +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+1] * hh + // hh contains two householder vectors, with offset 1 + 
///////////////////////////////////////////////////// +#endif +#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE x4 = _LOAD(&q[ldq+3*offset]); + __SIMD_DATATYPE x5 = _LOAD(&q[ldq+4*offset]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + __SIMD_DATATYPE h2; + +#ifdef __ELPA_USE_FMA__ + 
__SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_FMA(x2, h1, q2); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_FMA(x3, h1, q3); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_FMA(x4, h1, q4); + __SIMD_DATATYPE q5 = _LOAD(&q[4*offset]); + __SIMD_DATATYPE y5 = _SIMD_FMA(x5, h1, q5); +#else + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); + __SIMD_DATATYPE q5 = _LOAD(&q[4*offset]); + __SIMD_DATATYPE y5 = _SIMD_ADD(q5, _SIMD_MUL(x5, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif 
+ +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[0+offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); + register __SIMD_DATATYPE x2 = a1_2; +#else + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, 
_SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); + register __SIMD_DATATYPE x2 = a1_2; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[0+2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); + register __SIMD_DATATYPE x3 = a1_3; +#else + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); + register __SIMD_DATATYPE x3 = a1_3; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[ldq+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[0+3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); + register __SIMD_DATATYPE x4 = a1_4; +#else + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, 
_SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); + register __SIMD_DATATYPE x4 = a1_4; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_5 = _LOAD(&q[(ldq*3)+4*offset]); + __SIMD_DATATYPE a2_5 = _LOAD(&q[(ldq*2)+4*offset]); + __SIMD_DATATYPE a3_5 = _LOAD(&q[ldq+4*offset]); + __SIMD_DATATYPE a4_5 = _LOAD(&q[0+4*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w5 = _SIMD_FMA(a3_5, h_4_3, a4_5); + w5 = _SIMD_FMA(a2_5, h_4_2, w5); + w5 = _SIMD_FMA(a1_5, h_4_1, w5); + register __SIMD_DATATYPE z5 = _SIMD_FMA(a2_5, h_3_2, a3_5); + z5 = _SIMD_FMA(a1_5, h_3_1, z5); + register __SIMD_DATATYPE y5 = _SIMD_FMA(a1_5, h_2_1, a2_5); + register __SIMD_DATATYPE x5 = a1_5; +#else + register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a2_5, h_4_2)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1)); + register __SIMD_DATATYPE z5 = _SIMD_ADD(a3_5, _SIMD_MUL(a2_5, h_3_2)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1)); + register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1)); + register __SIMD_DATATYPE x5 = a1_5; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + __SIMD_DATATYPE q5; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = 
_SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 = _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = 
_SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = 
_SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*5)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*4)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a5_2 = _LOAD(&q[(ldq)+offset]); + __SIMD_DATATYPE a6_2 = _LOAD(&q[offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t2 = _SIMD_FMA(a5_2, h_6_5, a6_2); + t2 = _SIMD_FMA(a4_2, h_6_4, t2); + t2 = _SIMD_FMA(a3_2, h_6_3, t2); + t2 = _SIMD_FMA(a2_2, h_6_2, t2); + t2 = _SIMD_FMA(a1_2, h_6_1, t2); + register __SIMD_DATATYPE v2 = _SIMD_FMA(a4_2, h_5_4, a5_2); + v2 = _SIMD_FMA(a3_2, h_5_3, v2); + v2 = _SIMD_FMA(a2_2, h_5_2, v2); + v2 = _SIMD_FMA(a1_2, h_5_1, v2); + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); +#else + register __SIMD_DATATYPE t2 = _SIMD_ADD(a6_2, _SIMD_MUL(a5_2, h_6_5)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a4_2, h_6_4)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a3_2, 
h_6_3)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a2_2, h_6_2)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a1_2, h_6_1)); + register __SIMD_DATATYPE v2 = _SIMD_ADD(a5_2, _SIMD_MUL(a4_2, h_5_4)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a3_2, h_5_3)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a2_2, h_5_2)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a1_2, h_5_1)); + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x2 = a1_2; + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*5)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*4)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a5_3 = _LOAD(&q[(ldq)+2*offset]); + __SIMD_DATATYPE a6_3 = _LOAD(&q[2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t3 = _SIMD_FMA(a5_3, h_6_5, a6_3); + t3 = _SIMD_FMA(a4_3, h_6_4, t3); + t3 = _SIMD_FMA(a3_3, h_6_3, t3); + t3 = _SIMD_FMA(a2_3, h_6_2, t3); + t3 = _SIMD_FMA(a1_3, h_6_1, t3); + register __SIMD_DATATYPE v3 = _SIMD_FMA(a4_3, h_5_4, a5_3); + v3 = _SIMD_FMA(a3_3, h_5_3, v3); + v3 = _SIMD_FMA(a2_3, h_5_2, v3); + v3 = _SIMD_FMA(a1_3, h_5_1, v3); + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); +#else + register __SIMD_DATATYPE t3 = _SIMD_ADD(a6_3, _SIMD_MUL(a5_3, h_6_5)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a4_3, h_6_4)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a3_3, h_6_3)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a2_3, h_6_2)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a1_3, 
h_6_1)); + register __SIMD_DATATYPE v3 = _SIMD_ADD(a5_3, _SIMD_MUL(a4_3, h_5_4)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a3_3, h_5_3)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a2_3, h_5_2)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a1_3, h_5_1)); + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x3 = a1_3; + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*5)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*4)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a5_4 = _LOAD(&q[(ldq)+3*offset]); + __SIMD_DATATYPE a6_4 = _LOAD(&q[3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t4 = _SIMD_FMA(a5_4, h_6_5, a6_4); + t4 = _SIMD_FMA(a4_4, h_6_4, t4); + t4 = _SIMD_FMA(a3_4, h_6_3, t4); + t4 = _SIMD_FMA(a2_4, h_6_2, t4); + t4 = _SIMD_FMA(a1_4, h_6_1, t4); + register __SIMD_DATATYPE v4 = _SIMD_FMA(a4_4, h_5_4, a5_4); + v4 = _SIMD_FMA(a3_4, h_5_3, v4); + v4 = _SIMD_FMA(a2_4, h_5_2, v4); + v4 = _SIMD_FMA(a1_4, h_5_1, v4); + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); +#else + register __SIMD_DATATYPE t4 = _SIMD_ADD(a6_4, _SIMD_MUL(a5_4, h_6_5)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a4_4, h_6_4)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a3_4, h_6_3)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a2_4, h_6_2)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a1_4, h_6_1)); + register __SIMD_DATATYPE v4 = _SIMD_ADD(a5_4, _SIMD_MUL(a4_4, h_5_4)); + v4 = 
_SIMD_ADD(v4, _SIMD_MUL(a3_4, h_5_3)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a2_4, h_5_2)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a1_4, h_5_1)); + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x4 = a1_4; + + __SIMD_DATATYPE a1_5 = _LOAD(&q[(ldq*5)+4*offset]); + __SIMD_DATATYPE a2_5 = _LOAD(&q[(ldq*4)+4*offset]); + __SIMD_DATATYPE a3_5 = _LOAD(&q[(ldq*3)+4*offset]); + __SIMD_DATATYPE a4_5 = _LOAD(&q[(ldq*2)+4*offset]); + __SIMD_DATATYPE a5_5 = _LOAD(&q[(ldq)+4*offset]); + __SIMD_DATATYPE a6_5 = _LOAD(&q[4*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t5 = _SIMD_FMA(a5_5, h_6_5, a6_5); + t5 = _SIMD_FMA(a4_5, h_6_4, t5); + t5 = _SIMD_FMA(a3_5, h_6_3, t5); + t5 = _SIMD_FMA(a2_5, h_6_2, t5); + t5 = _SIMD_FMA(a1_5, h_6_1, t5); + register __SIMD_DATATYPE v5 = _SIMD_FMA(a4_5, h_5_4, a5_5); + v5 = _SIMD_FMA(a3_5, h_5_3, v5); + v5 = _SIMD_FMA(a2_5, h_5_2, v5); + v5 = _SIMD_FMA(a1_5, h_5_1, v5); + register __SIMD_DATATYPE w5 = _SIMD_FMA(a3_5, h_4_3, a4_5); + w5 = _SIMD_FMA(a2_5, h_4_2, w5); + w5 = _SIMD_FMA(a1_5, h_4_1, w5); + register __SIMD_DATATYPE z5 = _SIMD_FMA(a2_5, h_3_2, a3_5); + z5 = _SIMD_FMA(a1_5, h_3_1, z5); + register __SIMD_DATATYPE y5 = _SIMD_FMA(a1_5, h_2_1, a2_5); +#else + register __SIMD_DATATYPE t5 = _SIMD_ADD(a6_5, _SIMD_MUL(a5_5, h_6_5)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a4_5, h_6_4)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a3_5, h_6_3)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a2_5, h_6_2)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(a1_5, h_6_1)); + register __SIMD_DATATYPE v5 = _SIMD_ADD(a5_5, _SIMD_MUL(a4_5, h_5_4)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(a3_5, h_5_3)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(a2_5, h_5_2)); + v5 = 
_SIMD_ADD(v5, _SIMD_MUL(a1_5, h_5_1)); + register __SIMD_DATATYPE w5 = _SIMD_ADD(a4_5, _SIMD_MUL(a3_5, h_4_3)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a2_5, h_4_2)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(a1_5, h_4_1)); + register __SIMD_DATATYPE z5 = _SIMD_ADD(a3_5, _SIMD_MUL(a2_5, h_3_2)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(a1_5, h_3_1)); + register __SIMD_DATATYPE y5 = _SIMD_ADD(a2_5, _SIMD_MUL(a1_5, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x5 = a1_5; + + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + __SIMD_DATATYPE q5; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + + for(i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + q5 = _LOAD(&q[(i*ldq)+4*offset]); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); + x2 = _SIMD_FMA(q2, h1, x2); + y2 = _SIMD_FMA(q2, h2, y2); + x3 = _SIMD_FMA(q3, h1, x3); + y3 = _SIMD_FMA(q3, h2, y3); + x4 = _SIMD_FMA(q4, h1, x4); + y4 = _SIMD_FMA(q4, h2, y4); + x5 = _SIMD_FMA(q5, h1, x5); + y5 = _SIMD_FMA(q5, h2, y5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + x4 = 
_SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif 
+#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); + v5 = _SIMD_FMA(q5, h5, v5); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif + + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); + t2 = _SIMD_FMA(q2, h6, t2); + t3 = _SIMD_FMA(q3, h6, t3); + t4 = _SIMD_FMA(q4, h6, t4); + t5 = _SIMD_FMA(q5, h6, t5); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(q2,h6)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(q3,h6)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(q4,h6)); + t5 = _SIMD_ADD(t5, _SIMD_MUL(q5,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + q5 = _LOAD(&q[(nb*ldq)+4*offset]); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + 
x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); +#endif + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = 
_LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = 
_SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); + v5 = _SIMD_FMA(q5, h5, v5); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); + v5 = _SIMD_ADD(v5, _SIMD_MUL(q5,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-4]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-4], hh[nb-4]); +#endif +#if VEC_SET == 
AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-3], hh[ldh+nb-3]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, 
_SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); + w5 = _SIMD_FMA(q5, h4, w5); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); + w5 = _SIMD_ADD(w5, _SIMD_MUL(q5,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-3], hh[nb-3]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-3]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif + +#if VEC_SET == 
AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); + z5 = _SIMD_FMA(q5, h3, z5); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); + z5 = _SIMD_ADD(z5, _SIMD_MUL(q5,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+3)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, 
_SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); + y5 = _SIMD_FMA(q5, h2, y5); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); + y5 = _SIMD_ADD(y5, _SIMD_MUL(q5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+4)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); + x5 = _SIMD_FMA(q5, h1, x5); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + x5 = _SIMD_ADD(x5, _SIMD_MUL(q5,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Rank-2 update of Q [ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + 
///////////////////////////////////////////////////// + // Rank-1 update of Q [ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE 
vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = 
_SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); 
+#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + x2 = _SIMD_MUL(x2, h1); + x3 = _SIMD_MUL(x3, h1); + x4 = _SIMD_MUL(x4, h1); + x5 = _SIMD_MUL(x5, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + h2 = _SIMD_MUL(h1, vs); +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMA(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMA(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMA(y4, h1, _SIMD_MUL(x4,h2)); + y5 = _SIMD_FMA(y5, h1, _SIMD_MUL(x5,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_ADD(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_ADD(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = 
_SIMD_ADD(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); + y5 = _SIMD_ADD(_SIMD_MUL(y5,h1), _SIMD_MUL(x5,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMSUB(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMSUB(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMSUB(y4, h1, _SIMD_MUL(x4,h2)); + y5 = _SIMD_FMSUB(y5, h1, _SIMD_MUL(x5,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_SUB(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_SUB(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = _SIMD_SUB(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); + y5 = _SIMD_SUB(_SIMD_MUL(y5,h1), _SIMD_MUL(x5,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); + z2 = _SIMD_FMSUB(z2, h1, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))); + z3 = _SIMD_FMSUB(z3, h1, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))); + z4 = _SIMD_FMSUB(z4, h1, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2))); + z5 = _SIMD_FMSUB(z5, h1, _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); + z2 = _SIMD_SUB(_SIMD_MUL(z2,h1), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))); + z3 = _SIMD_SUB(_SIMD_MUL(z3,h1), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))); + z4 = _SIMD_SUB(_SIMD_MUL(z4,h1), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))); + z5 = _SIMD_SUB(_SIMD_MUL(z5,h1), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + w2 = _SIMD_FMSUB(w2, h1, _SIMD_FMA(z2, h4, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + w3 = _SIMD_FMSUB(w3, h1, _SIMD_FMA(z3, h4, _SIMD_FMA(y3, h3, 
_SIMD_MUL(x3,h2)))); + w4 = _SIMD_FMSUB(w4, h1, _SIMD_FMA(z4, h4, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); + w5 = _SIMD_FMSUB(w5, h1, _SIMD_FMA(z5, h4, _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + w2 = _SIMD_SUB(_SIMD_MUL(w2,h1), _SIMD_ADD(_SIMD_MUL(z2,h4), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + w3 = _SIMD_SUB(_SIMD_MUL(w3,h1), _SIMD_ADD(_SIMD_MUL(z3,h4), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); + w4 = _SIMD_SUB(_SIMD_MUL(w4,h1), _SIMD_ADD(_SIMD_MUL(z4,h4), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); + w5 = _SIMD_SUB(_SIMD_MUL(w5,h1), _SIMD_ADD(_SIMD_MUL(z5,h4), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + v2 = _SIMD_FMSUB(v2, tau5, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + v3 = _SIMD_FMSUB(v3, tau5, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); + v4 = _SIMD_FMSUB(v4, tau5, _SIMD_ADD(_SIMD_FMA(w4, h5, _SIMD_MUL(z4,h4)), _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); + v5 = _SIMD_FMSUB(v5, tau5, _SIMD_ADD(_SIMD_FMA(w5, h5, _SIMD_MUL(z5,h4)), _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + v2 = _SIMD_SUB(_SIMD_MUL(v2,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + v3 = _SIMD_SUB(_SIMD_MUL(v3,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), 
_SIMD_MUL(x3,h2)))); + v4 = _SIMD_SUB(_SIMD_MUL(v4,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); + v5 = _SIMD_SUB(_SIMD_MUL(v5,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w5,h5), _SIMD_MUL(z5,h4)), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2)))); +#endif /* __ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); + t2 = _SIMD_FMSUB(t2, tau6, _SIMD_FMA(v2, h6, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))))); + t3 = _SIMD_FMSUB(t3, tau6, _SIMD_FMA(v3, h6, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))))); + t4 = _SIMD_FMSUB(t4, tau6, _SIMD_FMA(v4, h6, _SIMD_ADD(_SIMD_FMA(w4, h5, _SIMD_MUL(z4,h4)), _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2))))); + t5 = _SIMD_FMSUB(t5, tau6, _SIMD_FMA(v5, h6, _SIMD_ADD(_SIMD_FMA(w5, h5, _SIMD_MUL(z5,h4)), _SIMD_FMA(y5, h3, _SIMD_MUL(x5,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); + t2 = _SIMD_SUB(_SIMD_MUL(t2,tau6), _SIMD_ADD( _SIMD_MUL(v2,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))))); + t3 = _SIMD_SUB(_SIMD_MUL(t3,tau6), _SIMD_ADD( _SIMD_MUL(v3,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))))); + t4 = _SIMD_SUB(_SIMD_MUL(t4,tau6), _SIMD_ADD( _SIMD_MUL(v4,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))))); + t5 = _SIMD_SUB(_SIMD_MUL(t5,tau6), _SIMD_ADD( _SIMD_MUL(v5,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w5,h5), 
_SIMD_MUL(z5,h4)), _SIMD_ADD(_SIMD_MUL(y5,h3), _SIMD_MUL(x5,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); +#endif + _STORE(&q[0],q1); + q2 = _LOAD(&q[offset]); +#ifdef BLOCK2 + q2 = _SIMD_ADD(q2, y2); +#endif +#ifdef BLOCK4 + q2 = _SIMD_SUB(q2, w2); +#endif +#ifdef BLOCK6 + q2 = _SIMD_SUB(q2, t2); +#endif + _STORE(&q[offset],q2); + q3 = _LOAD(&q[2*offset]); +#ifdef BLOCK2 + q3 = _SIMD_ADD(q3, y3); +#endif +#ifdef BLOCK4 + q3 = _SIMD_SUB(q3, w3); +#endif +#ifdef BLOCK6 + q3 = _SIMD_SUB(q3, t3); +#endif + _STORE(&q[2*offset],q3); + q4 = _LOAD(&q[3*offset]); +#ifdef BLOCK2 + q4 = _SIMD_ADD(q4, y4); +#endif +#ifdef BLOCK4 + q4 = _SIMD_SUB(q4, w4); +#endif +#ifdef BLOCK6 + q4 = _SIMD_SUB(q4, t4); +#endif + _STORE(&q[3*offset],q4); + q5 = _LOAD(&q[4*offset]); +#ifdef BLOCK2 + q5 = _SIMD_ADD(q5, y5); +#endif +#ifdef BLOCK4 + q5 = _SIMD_SUB(q5, w5); +#endif +#ifdef BLOCK6 + q5 = _SIMD_SUB(q5, t5); +#endif + _STORE(&q[4*offset],q5); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + q5 = _LOAD(&q[ldq+4*offset]); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + q2 = _SIMD_ADD(q2, _SIMD_FMA(y2, h2, x2)); + q3 = _SIMD_ADD(q3, _SIMD_FMA(y3, h2, x3)); + q4 = _SIMD_ADD(q4, _SIMD_FMA(y4, h2, x4)); + q5 = _SIMD_ADD(q5, _SIMD_FMA(y5, h2, x5)); +#else + q1 = _SIMD_ADD(q1, 
_SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(x2, _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(x3, _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(x4, _SIMD_MUL(y4, h2))); + q5 = _SIMD_ADD(q5, _SIMD_ADD(x5, _SIMD_MUL(y5, h2))); +#endif /* __ELPA_USE_FMA__ */ + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + _STORE(&q[ldq+3*offset],q4); + _STORE(&q[ldq+4*offset],q5); +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + q5 = _LOAD(&q[ldq+4*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); + q2 = _SIMD_SUB(q2, _SIMD_FMA(w2, h4, z2)); + q3 = _SIMD_SUB(q3, _SIMD_FMA(w3, h4, z3)); + q4 = _SIMD_SUB(q4, _SIMD_FMA(w4, h4, z4)); + q5 = _SIMD_SUB(q5, _SIMD_FMA(w5, h4, z5)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); + q2 = _SIMD_SUB(q2, _SIMD_ADD(z2, _SIMD_MUL(w2, h4))); + q3 = _SIMD_SUB(q3, _SIMD_ADD(z3, _SIMD_MUL(w3, h4))); + q4 = _SIMD_SUB(q4, _SIMD_ADD(z4, _SIMD_MUL(w4, h4))); + q5 = _SIMD_SUB(q5, _SIMD_ADD(z5, _SIMD_MUL(w5, h4))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + _STORE(&q[ldq+3*offset],q4); + _STORE(&q[ldq+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + 
q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + q5 = _LOAD(&q[(ldq*2)+4*offset]); + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + q5 = _SIMD_SUB(q5, y5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + _STORE(&q[(ldq*2)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + q5 = 
_LOAD(&q[(ldq*3)+4*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + q5 = _SIMD_SUB(q5, x5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif /* __ELPA_USE_FMA__ 
*/ + + _STORE(&q[ldq*3], q1); + _STORE(&q[(ldq*3)+offset], q2); + _STORE(&q[(ldq*3)+2*offset], q3); + _STORE(&q[(ldq*3)+3*offset], q4); + _STORE(&q[(ldq*3)+4*offset], q5); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[(ldq+offset)]); + q3 = _LOAD(&q[(ldq+2*offset)]); + q4 = _LOAD(&q[(ldq+3*offset)]); + q5 = _LOAD(&q[(ldq+4*offset)]); + q1 = _SIMD_SUB(q1, v1); + q2 = _SIMD_SUB(q2, v2); + q3 = _SIMD_SUB(q3, v3); + q4 = _SIMD_SUB(q4, v4); + q5 = _SIMD_SUB(q5, v5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif + + _STORE(&q[ldq],q1); + _STORE(&q[(ldq+offset)],q2); + _STORE(&q[(ldq+2*offset)],q3); + _STORE(&q[(ldq+3*offset)],q4); + _STORE(&q[(ldq+4*offset)],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + q5 = _LOAD(&q[(ldq*2)+4*offset]); + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + q3 = _SIMD_SUB(q3, w3); + q4 = _SIMD_SUB(q4, w4); + q5 = _SIMD_SUB(q5, w5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, 
h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + _STORE(&q[(ldq*2)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + q5 = _LOAD(&q[(ldq*3)+4*offset]); + q1 = _SIMD_SUB(q1, z1); + q2 = _SIMD_SUB(q2, z2); + q3 = _SIMD_SUB(q3, z3); + q4 = _SIMD_SUB(q4, z4); + q5 = _SIMD_SUB(q5, z5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = 
_SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif + + _STORE(&q[ldq*3],q1); + _STORE(&q[(ldq*3)+offset],q2); + _STORE(&q[(ldq*3)+2*offset],q3); + _STORE(&q[(ldq*3)+3*offset],q4); + _STORE(&q[(ldq*3)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + + q1 = _LOAD(&q[ldq*4]); + q2 = _LOAD(&q[(ldq*4)+offset]); + q3 = _LOAD(&q[(ldq*4)+2*offset]); + q4 = _LOAD(&q[(ldq*4)+3*offset]); + q5 = _LOAD(&q[(ldq*4)+4*offset]); + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + q5 = _SIMD_SUB(q5, y5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 
= _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif + + _STORE(&q[ldq*4],q1); + _STORE(&q[(ldq*4)+offset],q2); + _STORE(&q[(ldq*4)+2*offset],q3); + _STORE(&q[(ldq*4)+3*offset],q4); + _STORE(&q[(ldq*4)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q2 = _LOAD(&q[(ldq*5)+offset]); + q3 = _LOAD(&q[(ldq*5)+2*offset]); + q4 = _LOAD(&q[(ldq*5)+3*offset]); + q5 = _LOAD(&q[(ldq*5)+4*offset]); + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + q5 = _SIMD_SUB(q5, x5); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); 
+#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef 
__ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif + + _STORE(&q[ldq*5],q1); + _STORE(&q[(ldq*5)+offset],q2); + _STORE(&q[(ldq*5)+2*offset],q3); + _STORE(&q[(ldq*5)+3*offset],q4); + _STORE(&q[(ldq*5)+4*offset],q5); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + q5 = _LOAD(&q[(i*ldq)+4*offset]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ 
+ q1 = _SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q2 = _SIMD_FMA(y2, h2, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q3 = _SIMD_FMA(y3, h2, q3); + q4 = _SIMD_FMA(x4, h1, q4); + q4 = _SIMD_FMA(y4, h2, q4); + q5 = _SIMD_FMA(x5, h1, q5); + q5 = _SIMD_FMA(y5, h2, q5); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(_SIMD_MUL(x2,h1), _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(_SIMD_MUL(x3,h1), _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(_SIMD_MUL(x4,h1), _SIMD_MUL(y4, h2))); + q5 = _SIMD_ADD(q5, _SIMD_ADD(_SIMD_MUL(x5,h1), _SIMD_MUL(y5, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2,h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3,h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4,h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5,h1)); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3,h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4,h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); 
+ q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3,h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4,h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2,h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3,h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4,h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = 
_SIMD_SET1(hh[(ldh*5)+i]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); + q5 = _SIMD_NFMA(t5, h6, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(t5, h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[i*ldq],q1); + _STORE(&q[(i*ldq)+offset],q2); + _STORE(&q[(i*ldq)+2*offset],q3); + _STORE(&q[(i*ldq)+3*offset],q4); + _STORE(&q[(i*ldq)+4*offset],q5); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + q5 = _LOAD(&q[(nb*ldq)+4*offset]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q4 = _SIMD_FMA(x4, h1, q4); + q5 = _SIMD_FMA(x5, h1, q5); +#else + q1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_ADD(q5, _SIMD_MUL(x5, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, 
_SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h4 = 
_SIMD_BROADCAST(&hh[(ldh*3)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); + q5 = _SIMD_NFMA(v5, h5, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(v5, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + _STORE(&q[(nb*ldq)+offset],q2); + _STORE(&q[(nb*ldq)+2*offset],q3); + _STORE(&q[(nb*ldq)+3*offset],q4); + _STORE(&q[(nb*ldq)+4*offset],q5); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+1)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, 
q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = 
_SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); + q5 = _SIMD_NFMA(w5, h4, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(w5, h4)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+1)*ldq],q1); + _STORE(&q[((nb+1)*ldq)+offset],q2); + _STORE(&q[((nb+1)*ldq)+2*offset],q3); + _STORE(&q[((nb+1)*ldq)+3*offset],q4); + _STORE(&q[((nb+1)*ldq)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+2)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef 
__ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); + q5 = _SIMD_NFMA(z5, h3, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(z5, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[(nb+2)*ldq],q1); + _STORE(&q[((nb+2)*ldq)+offset],q2); + _STORE(&q[((nb+2)*ldq)+2*offset],q3); + _STORE(&q[((nb+2)*ldq)+3*offset],q4); + _STORE(&q[((nb+2)*ldq)+4*offset],q5); + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+3)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, 
q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); + q5 = _SIMD_NFMA(y5, h2, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(y5, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); + _STORE(&q[((nb+3)*ldq)+offset],q2); + _STORE(&q[((nb+3)*ldq)+2*offset],q3); + _STORE(&q[((nb+3)*ldq)+3*offset],q4); + _STORE(&q[((nb+3)*ldq)+4*offset],q5); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + q5 = _LOAD(&q[((nb+4)*ldq)+4*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); + q5 = _SIMD_NFMA(x5, h1, q5); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, 
h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); + q5 = _SIMD_SUB(q5, _SIMD_MUL(x5, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + _STORE(&q[((nb+4)*ldq)+offset],q2); + _STORE(&q[((nb+4)*ldq)+2*offset],q3); + _STORE(&q[((nb+4)*ldq)+3*offset],q4); + _STORE(&q[((nb+4)*ldq)+4*offset],q5); + +#endif /* BLOCK6 */ +} + + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 64 +#endif +#endif /* VEC_SET == AVX_512 */ +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#if defined(BLOCK4) || defined(BLOCK6) +/* + * vectors + a rank 1 update is performed + */ +#endif + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ROW_LENGTH x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// +#endif 
+#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ROW_LENGTH x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE x4 = _LOAD(&q[ldq+3*offset]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + __SIMD_DATATYPE h2; + +#ifdef __ELPA_USE_FMA__ + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); 
+ __SIMD_DATATYPE y2 = _SIMD_FMA(x2, h1, q2); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_FMA(x3, h1, q3); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_FMA(x4, h1, q4); +#else + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + __SIMD_DATATYPE y3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + __SIMD_DATATYPE q4 = _LOAD(&q[3*offset]); + __SIMD_DATATYPE y4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = 
_SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[0+offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); + register __SIMD_DATATYPE x2 = a1_2; +#else + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); + register __SIMD_DATATYPE x2 = a1_2; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*2)+2*offset]); + 
__SIMD_DATATYPE a3_3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[0+2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); + register __SIMD_DATATYPE x3 = a1_3; +#else + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); + register __SIMD_DATATYPE x3 = a1_3; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[ldq+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[0+3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); + register __SIMD_DATATYPE x4 = a1_4; +#else + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); + register __SIMD_DATATYPE x4 = a1_4; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + + __SIMD_DATATYPE h1; + 
__SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 = _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == 
AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = 
_SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*5)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*4)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a5_2 = _LOAD(&q[(ldq)+offset]); + __SIMD_DATATYPE a6_2 = _LOAD(&q[offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t2 = _SIMD_FMA(a5_2, h_6_5, a6_2); + t2 = _SIMD_FMA(a4_2, h_6_4, t2); + t2 = 
_SIMD_FMA(a3_2, h_6_3, t2); + t2 = _SIMD_FMA(a2_2, h_6_2, t2); + t2 = _SIMD_FMA(a1_2, h_6_1, t2); + register __SIMD_DATATYPE v2 = _SIMD_FMA(a4_2, h_5_4, a5_2); + v2 = _SIMD_FMA(a3_2, h_5_3, v2); + v2 = _SIMD_FMA(a2_2, h_5_2, v2); + v2 = _SIMD_FMA(a1_2, h_5_1, v2); + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); +#else + register __SIMD_DATATYPE t2 = _SIMD_ADD(a6_2, _SIMD_MUL(a5_2, h_6_5)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a4_2, h_6_4)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a3_2, h_6_3)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a2_2, h_6_2)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a1_2, h_6_1)); + register __SIMD_DATATYPE v2 = _SIMD_ADD(a5_2, _SIMD_MUL(a4_2, h_5_4)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a3_2, h_5_3)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a2_2, h_5_2)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a1_2, h_5_1)); + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x2 = a1_2; + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*5)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*4)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a5_3 = _LOAD(&q[(ldq)+2*offset]); + __SIMD_DATATYPE a6_3 = _LOAD(&q[2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t3 = _SIMD_FMA(a5_3, h_6_5, a6_3); + t3 = _SIMD_FMA(a4_3, h_6_4, t3); + t3 = _SIMD_FMA(a3_3, h_6_3, t3); + t3 = _SIMD_FMA(a2_3, h_6_2, t3); + t3 = _SIMD_FMA(a1_3, h_6_1, 
t3); + register __SIMD_DATATYPE v3 = _SIMD_FMA(a4_3, h_5_4, a5_3); + v3 = _SIMD_FMA(a3_3, h_5_3, v3); + v3 = _SIMD_FMA(a2_3, h_5_2, v3); + v3 = _SIMD_FMA(a1_3, h_5_1, v3); + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); +#else + register __SIMD_DATATYPE t3 = _SIMD_ADD(a6_3, _SIMD_MUL(a5_3, h_6_5)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a4_3, h_6_4)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a3_3, h_6_3)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a2_3, h_6_2)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a1_3, h_6_1)); + register __SIMD_DATATYPE v3 = _SIMD_ADD(a5_3, _SIMD_MUL(a4_3, h_5_4)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a3_3, h_5_3)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a2_3, h_5_2)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a1_3, h_5_1)); + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x3 = a1_3; + + __SIMD_DATATYPE a1_4 = _LOAD(&q[(ldq*5)+3*offset]); + __SIMD_DATATYPE a2_4 = _LOAD(&q[(ldq*4)+3*offset]); + __SIMD_DATATYPE a3_4 = _LOAD(&q[(ldq*3)+3*offset]); + __SIMD_DATATYPE a4_4 = _LOAD(&q[(ldq*2)+3*offset]); + __SIMD_DATATYPE a5_4 = _LOAD(&q[(ldq)+3*offset]); + __SIMD_DATATYPE a6_4 = _LOAD(&q[3*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t4 = _SIMD_FMA(a5_4, h_6_5, a6_4); + t4 = _SIMD_FMA(a4_4, h_6_4, t4); + t4 = _SIMD_FMA(a3_4, h_6_3, t4); + t4 = _SIMD_FMA(a2_4, h_6_2, t4); + t4 = _SIMD_FMA(a1_4, h_6_1, t4); + register __SIMD_DATATYPE v4 = _SIMD_FMA(a4_4, h_5_4, a5_4); + v4 = _SIMD_FMA(a3_4, 
h_5_3, v4); + v4 = _SIMD_FMA(a2_4, h_5_2, v4); + v4 = _SIMD_FMA(a1_4, h_5_1, v4); + register __SIMD_DATATYPE w4 = _SIMD_FMA(a3_4, h_4_3, a4_4); + w4 = _SIMD_FMA(a2_4, h_4_2, w4); + w4 = _SIMD_FMA(a1_4, h_4_1, w4); + register __SIMD_DATATYPE z4 = _SIMD_FMA(a2_4, h_3_2, a3_4); + z4 = _SIMD_FMA(a1_4, h_3_1, z4); + register __SIMD_DATATYPE y4 = _SIMD_FMA(a1_4, h_2_1, a2_4); +#else + register __SIMD_DATATYPE t4 = _SIMD_ADD(a6_4, _SIMD_MUL(a5_4, h_6_5)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a4_4, h_6_4)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a3_4, h_6_3)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a2_4, h_6_2)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(a1_4, h_6_1)); + register __SIMD_DATATYPE v4 = _SIMD_ADD(a5_4, _SIMD_MUL(a4_4, h_5_4)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a3_4, h_5_3)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a2_4, h_5_2)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(a1_4, h_5_1)); + register __SIMD_DATATYPE w4 = _SIMD_ADD(a4_4, _SIMD_MUL(a3_4, h_4_3)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a2_4, h_4_2)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(a1_4, h_4_1)); + register __SIMD_DATATYPE z4 = _SIMD_ADD(a3_4, _SIMD_MUL(a2_4, h_3_2)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(a1_4, h_3_1)); + register __SIMD_DATATYPE y4 = _SIMD_ADD(a2_4, _SIMD_MUL(a1_4, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x4 = a1_4; + + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + __SIMD_DATATYPE q4; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + for(i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = 
_SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); + x2 = _SIMD_FMA(q2, h1, x2); + y2 = _SIMD_FMA(q2, h2, y2); + x3 = _SIMD_FMA(q3, h1, x3); + y3 = _SIMD_FMA(q3, h2, y3); + x4 = _SIMD_FMA(q4, h1, x4); + y4 = _SIMD_FMA(q4, h2, y4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = 
_SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-(BLOCK-5)], hh[(ldh*4)+i-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); + t2 = _SIMD_FMA(q2, h6, t2); + t3 = _SIMD_FMA(q3, h6, t3); + t4 = _SIMD_FMA(q4, h6, t4); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(q2,h6)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(q3,h6)); + t4 = _SIMD_ADD(t4, _SIMD_MUL(q4,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = 
_LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); +#endif + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = 
_SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, 
_SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-4)], hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-(BLOCK-5)], hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); + v4 = _SIMD_FMA(q4, h5, v4); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); + v4 = _SIMD_ADD(v4, _SIMD_MUL(q4,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = 
_LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-4)], hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = 
_SIMD_SET(hh[(ldh*3)+nb-(BLOCK-5)], hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); + w4 = _SIMD_FMA(q4, h4, w4); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); + w4 = _SIMD_ADD(w4, _SIMD_MUL(q4,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-4)], hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == 
SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-5)], hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); + z4 = _SIMD_FMA(q4, h3, z4); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); + z4 = _SIMD_ADD(z4, _SIMD_MUL(q4,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-4)], hh[nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-5)], hh[ldh+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); + y4 = _SIMD_FMA(q4, h2, y4); +#else + y1 = _SIMD_ADD(y1, 
_SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); + y4 = _SIMD_ADD(y4, _SIMD_MUL(q4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-5)], hh[nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); + x4 = _SIMD_FMA(q4, h1, x4); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + x4 = _SIMD_ADD(x4, _SIMD_MUL(q4,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Rank-2 update of Q [ ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE 
tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, 
s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); 
+ __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + x2 = _SIMD_MUL(x2, h1); + x3 = _SIMD_MUL(x3, h1); + x4 = _SIMD_MUL(x4, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + 
h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + + h2 = _SIMD_MUL(h1, vs); +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMA(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMA(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMA(y4, h1, _SIMD_MUL(x4,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_ADD(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_ADD(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = _SIMD_ADD(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMSUB(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMSUB(y3, h1, _SIMD_MUL(x3,h2)); + y4 = _SIMD_FMSUB(y4, h1, _SIMD_MUL(x4,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_SUB(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_SUB(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); + y4 = _SIMD_SUB(_SIMD_MUL(y4,h1), _SIMD_MUL(x4,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); + z2 = _SIMD_FMSUB(z2, h1, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))); + z3 = _SIMD_FMSUB(z3, h1, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))); + z4 = 
_SIMD_FMSUB(z4, h1, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); + z2 = _SIMD_SUB(_SIMD_MUL(z2,h1), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))); + z3 = _SIMD_SUB(_SIMD_MUL(z3,h1), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))); + z4 = _SIMD_SUB(_SIMD_MUL(z4,h1), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + w2 = _SIMD_FMSUB(w2, h1, _SIMD_FMA(z2, h4, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + w3 = _SIMD_FMSUB(w3, h1, _SIMD_FMA(z3, h4, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); + w4 = _SIMD_FMSUB(w4, h1, _SIMD_FMA(z4, h4, _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + w2 = _SIMD_SUB(_SIMD_MUL(w2,h1), _SIMD_ADD(_SIMD_MUL(z2,h4), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + w3 = _SIMD_SUB(_SIMD_MUL(w3,h1), _SIMD_ADD(_SIMD_MUL(z3,h4), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); + w4 = _SIMD_SUB(_SIMD_MUL(w4,h1), _SIMD_ADD(_SIMD_MUL(z4,h4), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + v2 = _SIMD_FMSUB(v2, tau5, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + v3 = _SIMD_FMSUB(v3, tau5, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); + v4 = _SIMD_FMSUB(v4, tau5, _SIMD_ADD(_SIMD_FMA(w4, h5, 
_SIMD_MUL(z4,h4)), _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + v2 = _SIMD_SUB(_SIMD_MUL(v2,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + v3 = _SIMD_SUB(_SIMD_MUL(v3,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); + v4 = _SIMD_SUB(_SIMD_MUL(v4,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2)))); +#endif /* __ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); + t2 = _SIMD_FMSUB(t2, tau6, _SIMD_FMA(v2, h6, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))))); + t3 = _SIMD_FMSUB(t3, tau6, _SIMD_FMA(v3, h6, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))))); + t4 = _SIMD_FMSUB(t4, tau6, _SIMD_FMA(v4, h6, _SIMD_ADD(_SIMD_FMA(w4, h5, _SIMD_MUL(z4,h4)), _SIMD_FMA(y4, h3, _SIMD_MUL(x4,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); + t2 = _SIMD_SUB(_SIMD_MUL(t2,tau6), _SIMD_ADD( _SIMD_MUL(v2,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))))); + t3 = _SIMD_SUB(_SIMD_MUL(t3,tau6), _SIMD_ADD( _SIMD_MUL(v3,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))))); + t4 = _SIMD_SUB(_SIMD_MUL(t4,tau6), _SIMD_ADD( _SIMD_MUL(v4,h6), 
_SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w4,h5), _SIMD_MUL(z4,h4)), _SIMD_ADD(_SIMD_MUL(y4,h3), _SIMD_MUL(x4,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); + q2 = _LOAD(&q[offset]); + q3 = _LOAD(&q[2*offset]); + q4 = _LOAD(&q[3*offset]); + +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); + q2 = _SIMD_ADD(q2, y2); + q3 = _SIMD_ADD(q3, y3); + q4 = _SIMD_ADD(q4, y4); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + q3 = _SIMD_SUB(q3, w3); + q4 = _SIMD_SUB(q4, w4); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); + q2 = _SIMD_SUB(q2, t2); + q3 = _SIMD_SUB(q3, t3); + q4 = _SIMD_SUB(q4, t4); +#endif + + _STORE(&q[0],q1); + _STORE(&q[offset],q2); + _STORE(&q[2*offset],q3); + _STORE(&q[3*offset],q4); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + q2 = _SIMD_ADD(q2, _SIMD_FMA(y2, h2, x2)); + q3 = _SIMD_ADD(q3, _SIMD_FMA(y3, h2, x3)); + q4 = _SIMD_ADD(q4, _SIMD_FMA(y4, h2, x4)); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(x2, _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(x3, _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(x4, _SIMD_MUL(y4, h2))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + _STORE(&q[ldq+3*offset],q4); +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || 
VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + q4 = _LOAD(&q[ldq+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); + q2 = _SIMD_SUB(q2, _SIMD_FMA(w2, h4, z2)); + q3 = _SIMD_SUB(q3, _SIMD_FMA(w3, h4, z3)); + q4 = _SIMD_SUB(q4, _SIMD_FMA(w4, h4, z4)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); + q2 = _SIMD_SUB(q2, _SIMD_ADD(z2, _SIMD_MUL(w2, h4))); + q3 = _SIMD_SUB(q3, _SIMD_ADD(z3, _SIMD_MUL(w3, h4))); + q4 = _SIMD_SUB(q4, _SIMD_ADD(z4, _SIMD_MUL(w4, h4))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + _STORE(&q[ldq+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if 
VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = 
_SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3], q1); + _STORE(&q[(ldq*3)+offset], q2); + _STORE(&q[(ldq*3)+2*offset], q3); + _STORE(&q[(ldq*3)+3*offset], q4); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[(ldq+offset)]); + q3 = _LOAD(&q[(ldq+2*offset)]); + q4 = _LOAD(&q[(ldq+3*offset)]); + + q1 = _SIMD_SUB(q1, v1); + q2 = _SIMD_SUB(q2, v2); + q3 = _SIMD_SUB(q3, v3); + q4 = _SIMD_SUB(q4, v4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif + + _STORE(&q[ldq],q1); + _STORE(&q[(ldq+offset)],q2); + _STORE(&q[(ldq+2*offset)],q3); + _STORE(&q[(ldq+3*offset)],q4); + +#if VEC_SET == SSE_128 || VEC_SET == 
AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + q4 = _LOAD(&q[(ldq*2)+3*offset]); + + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + q3 = _SIMD_SUB(q3, w3); + q4 = _SIMD_SUB(q4, w4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + _STORE(&q[(ldq*2)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = 
_LOAD(&q[(ldq*3)+2*offset]); + q4 = _LOAD(&q[(ldq*3)+3*offset]); + + q1 = _SIMD_SUB(q1, z1); + q2 = _SIMD_SUB(q2, z2); + q3 = _SIMD_SUB(q3, z3); + q4 = _SIMD_SUB(q4, z4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif + + _STORE(&q[ldq*3],q1); + _STORE(&q[(ldq*3)+offset],q2); + _STORE(&q[(ldq*3)+2*offset],q3); + _STORE(&q[(ldq*3)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == 
VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + + q1 = _LOAD(&q[ldq*4]); + q2 = _LOAD(&q[(ldq*4)+offset]); + q3 = _LOAD(&q[(ldq*4)+2*offset]); + q4 = _LOAD(&q[(ldq*4)+3*offset]); + + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + q4 = _SIMD_SUB(q4, y4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 
= _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif + + _STORE(&q[ldq*4],q1); + _STORE(&q[(ldq*4)+offset],q2); + _STORE(&q[(ldq*4)+2*offset],q3); + _STORE(&q[(ldq*4)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q2 = _LOAD(&q[(ldq*5)+offset]); + q3 = _LOAD(&q[(ldq*5)+2*offset]); + q4 = _LOAD(&q[(ldq*5)+3*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + q4 = _SIMD_SUB(q4, x4); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 
= _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + 
+#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif + + _STORE(&q[ldq*5],q1); + _STORE(&q[(ldq*5)+offset],q2); + _STORE(&q[(ldq*5)+2*offset],q3); + _STORE(&q[(ldq*5)+3*offset],q4); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + q4 = _LOAD(&q[(i*ldq)+3*offset]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q2 = _SIMD_FMA(y2, h2, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q3 = _SIMD_FMA(y3, h2, q3); + q4 = _SIMD_FMA(x4, h1, q4); + q4 = _SIMD_FMA(y4, h2, q4); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(_SIMD_MUL(x2,h1), _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(_SIMD_MUL(x3,h1), _SIMD_MUL(y3, h2))); + q4 = _SIMD_ADD(q4, _SIMD_ADD(_SIMD_MUL(x4,h1), _SIMD_MUL(y4, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); + q2 = _SIMD_SUB(q2, 
_SIMD_MUL(x2,h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3,h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4,h1)); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3,h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3,h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2,h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3,h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6*/ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = 
_SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); + q4 = _SIMD_NFMA(t4, h6, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(t4, h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[i*ldq],q1); + _STORE(&q[(i*ldq)+offset],q2); + _STORE(&q[(i*ldq)+2*offset],q3); + _STORE(&q[(i*ldq)+3*offset],q4); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + q4 = _LOAD(&q[(nb*ldq)+3*offset]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q4 = _SIMD_FMA(x4, h1, q4); +#else + q1 
= _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_ADD(q4, _SIMD_MUL(x4, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || 
VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); + q4 = _SIMD_NFMA(v4, h5, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(v4, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + _STORE(&q[(nb*ldq)+offset],q2); + _STORE(&q[(nb*ldq)+2*offset],q3); + _STORE(&q[(nb*ldq)+3*offset],q4); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+1)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + 
q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = 
_SIMD_NFMA(w3, h4, q3); + q4 = _SIMD_NFMA(w4, h4, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(w4, h4)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+1)*ldq],q1); + _STORE(&q[((nb+1)*ldq)+offset],q2); + _STORE(&q[((nb+1)*ldq)+2*offset],q3); + _STORE(&q[((nb+1)*ldq)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+2)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = 
_SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); + q4 = _SIMD_NFMA(z4, h3, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(z4, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + _STORE(&q[(nb+2)*ldq],q1); + _STORE(&q[((nb+2)*ldq)+offset],q2); + _STORE(&q[((nb+2)*ldq)+2*offset],q3); + _STORE(&q[((nb+2)*ldq)+3*offset],q4); + +#endif /* BLOCK4 || BLOCK6*/ + + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+3)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); + q4 = _SIMD_NFMA(y4, h2, 
q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(y4, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); + _STORE(&q[((nb+3)*ldq)+offset],q2); + _STORE(&q[((nb+3)*ldq)+2*offset],q3); + _STORE(&q[((nb+3)*ldq)+3*offset],q4); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + q4 = _LOAD(&q[((nb+4)*ldq)+3*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); + q4 = _SIMD_NFMA(x4, h1, q4); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); + q4 = _SIMD_SUB(q4, _SIMD_MUL(x4, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + _STORE(&q[((nb+4)*ldq)+offset],q2); + _STORE(&q[((nb+4)*ldq)+2*offset],q3); + _STORE(&q[((nb+4)*ldq)+3*offset],q4); + +#endif /* BLOCK6 */ +} + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 6 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 12 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 12 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 24 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 48 +#endif 
+#endif /* VEC_SET == AVX_512 */ +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#ifdef BLOCK4 +/* + * vectors + a rank 1 update is performed + */ +#endif + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// +#endif +#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == 
AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE x3 = _LOAD(&q[ldq+2*offset]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + __SIMD_DATATYPE h2; + + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE q3 = _LOAD(&q[2*offset]); + +#ifdef __ELPA_USE_FMA__ + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); + __SIMD_DATATYPE y2 = _SIMD_FMA(x2, h1, q2); + __SIMD_DATATYPE y3 = _SIMD_FMA(x3, h1, q3); +#else + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + __SIMD_DATATYPE y2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + __SIMD_DATATYPE y3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], 
hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[0+offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); + register 
__SIMD_DATATYPE x2 = a1_2; +#else + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); + register __SIMD_DATATYPE x2 = a1_2; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[ldq+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[0+2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); + register __SIMD_DATATYPE x3 = a1_3; +#else + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); + register __SIMD_DATATYPE x3 = a1_3; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 = _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); 
+ __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == 
VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*5)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*4)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a5_2 = _LOAD(&q[(ldq)+offset]); + __SIMD_DATATYPE a6_2 = _LOAD(&q[offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t2 = _SIMD_FMA(a5_2, h_6_5, a6_2); + t2 = _SIMD_FMA(a4_2, h_6_4, t2); + t2 = _SIMD_FMA(a3_2, h_6_3, t2); + t2 = _SIMD_FMA(a2_2, h_6_2, t2); + t2 = _SIMD_FMA(a1_2, h_6_1, t2); + register __SIMD_DATATYPE v2 = _SIMD_FMA(a4_2, h_5_4, a5_2); + v2 = _SIMD_FMA(a3_2, h_5_3, v2); + v2 = _SIMD_FMA(a2_2, h_5_2, v2); + v2 = _SIMD_FMA(a1_2, h_5_1, v2); + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = 
_SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); +#else + register __SIMD_DATATYPE t2 = _SIMD_ADD(a6_2, _SIMD_MUL(a5_2, h_6_5)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a4_2, h_6_4)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a3_2, h_6_3)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a2_2, h_6_2)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a1_2, h_6_1)); + register __SIMD_DATATYPE v2 = _SIMD_ADD(a5_2, _SIMD_MUL(a4_2, h_5_4)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a3_2, h_5_3)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a2_2, h_5_2)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a1_2, h_5_1)); + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x2 = a1_2; + + __SIMD_DATATYPE a1_3 = _LOAD(&q[(ldq*5)+2*offset]); + __SIMD_DATATYPE a2_3 = _LOAD(&q[(ldq*4)+2*offset]); + __SIMD_DATATYPE a3_3 = _LOAD(&q[(ldq*3)+2*offset]); + __SIMD_DATATYPE a4_3 = _LOAD(&q[(ldq*2)+2*offset]); + __SIMD_DATATYPE a5_3 = _LOAD(&q[(ldq)+2*offset]); + __SIMD_DATATYPE a6_3 = _LOAD(&q[2*offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t3 = _SIMD_FMA(a5_3, h_6_5, a6_3); + t3 = _SIMD_FMA(a4_3, h_6_4, t3); + t3 = _SIMD_FMA(a3_3, h_6_3, t3); + t3 = _SIMD_FMA(a2_3, h_6_2, t3); + t3 = _SIMD_FMA(a1_3, h_6_1, t3); + register __SIMD_DATATYPE v3 = _SIMD_FMA(a4_3, h_5_4, a5_3); + v3 = _SIMD_FMA(a3_3, h_5_3, v3); + v3 = _SIMD_FMA(a2_3, h_5_2, v3); + v3 = _SIMD_FMA(a1_3, h_5_1, v3); + register __SIMD_DATATYPE w3 = _SIMD_FMA(a3_3, h_4_3, a4_3); + w3 = _SIMD_FMA(a2_3, h_4_2, w3); + w3 = _SIMD_FMA(a1_3, h_4_1, w3); + register __SIMD_DATATYPE z3 = _SIMD_FMA(a2_3, h_3_2, a3_3); + z3 = _SIMD_FMA(a1_3, h_3_1, z3); + register __SIMD_DATATYPE y3 = _SIMD_FMA(a1_3, h_2_1, a2_3); 
+#else + register __SIMD_DATATYPE t3 = _SIMD_ADD(a6_3, _SIMD_MUL(a5_3, h_6_5)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a4_3, h_6_4)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a3_3, h_6_3)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a2_3, h_6_2)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(a1_3, h_6_1)); + register __SIMD_DATATYPE v3 = _SIMD_ADD(a5_3, _SIMD_MUL(a4_3, h_5_4)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a3_3, h_5_3)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a2_3, h_5_2)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(a1_3, h_5_1)); + register __SIMD_DATATYPE w3 = _SIMD_ADD(a4_3, _SIMD_MUL(a3_3, h_4_3)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a2_3, h_4_2)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(a1_3, h_4_1)); + register __SIMD_DATATYPE z3 = _SIMD_ADD(a3_3, _SIMD_MUL(a2_3, h_3_2)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(a1_3, h_3_1)); + register __SIMD_DATATYPE y3 = _SIMD_ADD(a2_3, _SIMD_MUL(a1_3, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x3 = a1_3; + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + __SIMD_DATATYPE q3; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + + for(i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); + x2 = _SIMD_FMA(q2, h1, x2); + y2 = _SIMD_FMA(q2, h2, y2); + x3 = _SIMD_FMA(q3, h1, x3); + y3 = _SIMD_FMA(q3, h2, y3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = 
_SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); +#else + v1 = 
_SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i-(BLOCK-6)], hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); + t2 = _SIMD_FMA(q2, h6, t2); + t3 = _SIMD_FMA(q3, h6, t3); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(q2,h6)); + t3 = _SIMD_ADD(t3, _SIMD_MUL(q3,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); +#else + y1 = _SIMD_ADD(y1, 
_SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif 
/* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-4)], hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-(BLOCK-5)], hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); + v3 = _SIMD_FMA(q3, h5, v3); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); + v3 = _SIMD_ADD(v3, _SIMD_MUL(q3,h5)); +#endif /* 
__ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-4)], hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == 
VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-5)], hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); + w3 = _SIMD_FMA(q3, h4, w3); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); + w3 = _SIMD_ADD(w3, _SIMD_MUL(q3,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-4)], hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = 
_SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-5)], hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); + z3 = _SIMD_FMA(q3, h3, z3); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); + z3 = _SIMD_ADD(z3, _SIMD_MUL(q3,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-4)], hh[nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-5)], hh[ldh+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); + y3 = _SIMD_FMA(q3, h2, y3); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); + y3 = _SIMD_ADD(y3, _SIMD_MUL(q3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + 
h1 = _SIMD_SET(hh[nb-(BLOCK-5)], hh[nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); + x3 = _SIMD_FMA(q3, h1, x3); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + x3 = _SIMD_ADD(x3, _SIMD_MUL(q3,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Rank-2 update of Q [ ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = 
_SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = 
_SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = 
_SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + x2 = _SIMD_MUL(x2, h1); + x3 = _SIMD_MUL(x3, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif 
/* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + h2 = _SIMD_MUL(h1, vs); + +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMA(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMA(y3, h1, _SIMD_MUL(x3,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_ADD(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_ADD(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMSUB(y2, h1, _SIMD_MUL(x2,h2)); + y3 = _SIMD_FMSUB(y3, h1, _SIMD_MUL(x3,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_SUB(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); + y3 = _SIMD_SUB(_SIMD_MUL(y3,h1), _SIMD_MUL(x3,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); + z2 = _SIMD_FMSUB(z2, h1, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))); + z3 = _SIMD_FMSUB(z3, h1, _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); + z2 = _SIMD_SUB(_SIMD_MUL(z2,h1), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))); + z3 = _SIMD_SUB(_SIMD_MUL(z3,h1), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + w2 = _SIMD_FMSUB(w2, h1, _SIMD_FMA(z2, h4, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + w3 = _SIMD_FMSUB(w3, h1, _SIMD_FMA(z3, h4, 
_SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + w2 = _SIMD_SUB(_SIMD_MUL(w2,h1), _SIMD_ADD(_SIMD_MUL(z2,h4), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + w3 = _SIMD_SUB(_SIMD_MUL(w3,h1), _SIMD_ADD(_SIMD_MUL(z3,h4), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + v2 = _SIMD_FMSUB(v2, tau5, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); + v3 = _SIMD_FMSUB(v3, tau5, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, _SIMD_MUL(x3,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + v2 = _SIMD_SUB(_SIMD_MUL(v2,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); + v3 = _SIMD_SUB(_SIMD_MUL(v3,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2)))); +#endif /* __ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); + t2 = _SIMD_FMSUB(t2, tau6, _SIMD_FMA(v2, h6, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))))); + t3 = _SIMD_FMSUB(t3, tau6, _SIMD_FMA(v3, h6, _SIMD_ADD(_SIMD_FMA(w3, h5, _SIMD_MUL(z3,h4)), _SIMD_FMA(y3, h3, 
_SIMD_MUL(x3,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); + t2 = _SIMD_SUB(_SIMD_MUL(t2,tau6), _SIMD_ADD( _SIMD_MUL(v2,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))))); + t3 = _SIMD_SUB(_SIMD_MUL(t3,tau6), _SIMD_ADD( _SIMD_MUL(v3,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w3,h5), _SIMD_MUL(z3,h4)), _SIMD_ADD(_SIMD_MUL(y3,h3), _SIMD_MUL(x3,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); +#endif + _STORE(&q[0],q1); + q2 = _LOAD(&q[offset]); +#ifdef BLOCK2 + q2 = _SIMD_ADD(q2, y2); +#endif +#ifdef BLOCK4 + q2 = _SIMD_SUB(q2, w2); +#endif +#ifdef BLOCK6 + q2 = _SIMD_SUB(q2, t2); +#endif + _STORE(&q[offset],q2); + q3 = _LOAD(&q[2*offset]); +#ifdef BLOCK2 + q3 = _SIMD_ADD(q3, y3); +#endif +#ifdef BLOCK4 + q3 = _SIMD_SUB(q3, w3); +#endif +#ifdef BLOCK6 + q3 = _SIMD_SUB(q3, t3); +#endif + + _STORE(&q[2*offset],q3); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + q2 = _SIMD_ADD(q2, _SIMD_FMA(y2, h2, x2)); + q3 = _SIMD_ADD(q3, _SIMD_FMA(y3, h2, x3)); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(x2, 
_SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(x3, _SIMD_MUL(y3, h2))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[ldq+offset]); + q3 = _LOAD(&q[ldq+2*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); + q2 = _SIMD_SUB(q2, _SIMD_FMA(w2, h4, z2)); + q3 = _SIMD_SUB(q3, _SIMD_FMA(w3, h4, z3)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); + q2 = _SIMD_SUB(q2, _SIMD_ADD(z2, _SIMD_MUL(w2, h4))); + q3 = _SIMD_SUB(q3, _SIMD_ADD(z3, _SIMD_MUL(w3, h4))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + _STORE(&q[ldq+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if VEC_SET == 
SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET 
== SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3], q1); + _STORE(&q[(ldq*3)+offset], q2); + _STORE(&q[(ldq*3)+2*offset], q3); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[(ldq+offset)]); + q3 = _LOAD(&q[(ldq+2*offset)]); + + q1 = _SIMD_SUB(q1, v1); + q2 = _SIMD_SUB(q2, v2); + q3 = _SIMD_SUB(q3, v3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[(ldq+offset)],q2); + _STORE(&q[(ldq+2*offset)],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q3 = _LOAD(&q[(ldq*2)+2*offset]); + + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + q3 = _SIMD_SUB(q3, w3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); 
+#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + _STORE(&q[(ldq*2)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q3 = _LOAD(&q[(ldq*3)+2*offset]); + + q1 = _SIMD_SUB(q1, z1); + q2 = _SIMD_SUB(q2, z2); + q3 = _SIMD_SUB(q3, z3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = 
_SIMD_NFMA(v3, h5, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3],q1); + _STORE(&q[(ldq*3)+offset],q2); + _STORE(&q[(ldq*3)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + + q1 = _LOAD(&q[ldq*4]); + q2 = _LOAD(&q[(ldq*4)+offset]); + q3 = _LOAD(&q[(ldq*4)+2*offset]); + + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + q3 = _SIMD_SUB(q3, y3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, 
q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*4],q1); + _STORE(&q[(ldq*4)+offset],q2); + _STORE(&q[(ldq*4)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q2 = _LOAD(&q[(ldq*5)+offset]); + q3 = _LOAD(&q[(ldq*5)+2*offset]); + + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + q3 = _SIMD_SUB(q3, x3); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, 
q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == 
VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*5],q1); + _STORE(&q[(ldq*5)+offset],q2); + _STORE(&q[(ldq*5)+2*offset],q3); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + q3 = _LOAD(&q[(i*ldq)+2*offset]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q2 = _SIMD_FMA(y2, h2, q2); + q3 = _SIMD_FMA(x3, h1, q3); + q3 = _SIMD_FMA(y3, h2, q3); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(_SIMD_MUL(x2,h1), _SIMD_MUL(y2, h2))); + q3 = _SIMD_ADD(q3, _SIMD_ADD(_SIMD_MUL(x3,h1), _SIMD_MUL(y3, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2,h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3,h1)); 
+#endif /* __ELPA_USE_FMA__ */ + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2,h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); 
+ q3 = _SIMD_NFMA(v3, h5, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); + q3 = _SIMD_NFMA(t3, h6, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(t3, h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + _STORE(&q[i*ldq],q1); + _STORE(&q[(i*ldq)+offset],q2); + _STORE(&q[(i*ldq)+2*offset],q3); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + q3 = _LOAD(&q[(nb*ldq)+2*offset]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q3 = _SIMD_FMA(x3, h1, q3); +#else + q1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_ADD(q3, _SIMD_MUL(x3, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-2]); +#endif + + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == 
AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); + q3 = _SIMD_NFMA(v3, h5, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(v3, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + _STORE(&q[(nb*ldq)+offset],q2); + _STORE(&q[(nb*ldq)+2*offset],q3); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + q3 = _LOAD(&q[((nb+1)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], 
hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); + q3 = _SIMD_NFMA(w3, h4, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(w3, h4)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+1)*ldq],q1); + _STORE(&q[((nb+1)*ldq)+offset],q2); + _STORE(&q[((nb+1)*ldq)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + q3 = _LOAD(&q[((nb+2)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = 
_SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); + q3 = _SIMD_NFMA(z3, h3, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(z3, h3)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+2)*ldq],q1); + _STORE(&q[((nb+2)*ldq)+offset],q2); + _STORE(&q[((nb+2)*ldq)+2*offset],q3); + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + q3 = _LOAD(&q[((nb+3)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = 
_SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); + q3 = _SIMD_NFMA(y3, h2, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(y3, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); + _STORE(&q[((nb+3)*ldq)+offset],q2); + _STORE(&q[((nb+3)*ldq)+2*offset],q3); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + q3 = _LOAD(&q[((nb+4)*ldq)+2*offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); + q3 = _SIMD_NFMA(x3, h1, q3); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); + q3 = _SIMD_SUB(q3, _SIMD_MUL(x3, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + _STORE(&q[((nb+4)*ldq)+offset],q2); + _STORE(&q[((nb+4)*ldq)+2*offset],q3); + +#endif /* BLOCK6 */ +} + + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#ifdef 
SINGLE_PRECISION_REAL +#define ROW_LENGTH 32 +#endif +#endif /* VEC_SET == AVX_512 */ +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#if defined(BLOCK4) || defined(BLOCK6) +/* + * vectors + a rank 1 update is performed + */ +#endif + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// +#endif +#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = 
(__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE x2 = _LOAD(&q[ldq+offset]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + __SIMD_DATATYPE h2; +#ifdef __ELPA_USE_FMA__ + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_FMA(x2, h1, q2); +#else + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + __SIMD_DATATYPE q2 = _LOAD(&q[offset]); + __SIMD_DATATYPE y2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + 
__SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[ldq+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[0+offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); + register __SIMD_DATATYPE x2 = 
a1_2; +#else + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); + register __SIMD_DATATYPE x2 = a1_2; +#endif /* __ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 
= _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = 
_SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* 
__ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE a1_2 = _LOAD(&q[(ldq*5)+offset]); + __SIMD_DATATYPE a2_2 = _LOAD(&q[(ldq*4)+offset]); + __SIMD_DATATYPE a3_2 = _LOAD(&q[(ldq*3)+offset]); + __SIMD_DATATYPE a4_2 = _LOAD(&q[(ldq*2)+offset]); + __SIMD_DATATYPE a5_2 = _LOAD(&q[(ldq)+offset]); + __SIMD_DATATYPE a6_2 = _LOAD(&q[offset]); + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t2 = _SIMD_FMA(a5_2, h_6_5, a6_2); + t2 = _SIMD_FMA(a4_2, h_6_4, t2); + t2 = _SIMD_FMA(a3_2, h_6_3, t2); + t2 = _SIMD_FMA(a2_2, h_6_2, t2); + t2 = _SIMD_FMA(a1_2, h_6_1, t2); + register __SIMD_DATATYPE v2 = _SIMD_FMA(a4_2, h_5_4, a5_2); + v2 = _SIMD_FMA(a3_2, h_5_3, v2); + v2 = _SIMD_FMA(a2_2, h_5_2, v2); + v2 = _SIMD_FMA(a1_2, h_5_1, v2); + register __SIMD_DATATYPE w2 = _SIMD_FMA(a3_2, h_4_3, a4_2); + w2 = _SIMD_FMA(a2_2, h_4_2, w2); + w2 = _SIMD_FMA(a1_2, h_4_1, w2); + register __SIMD_DATATYPE z2 = _SIMD_FMA(a2_2, h_3_2, a3_2); + z2 = _SIMD_FMA(a1_2, h_3_1, z2); + register __SIMD_DATATYPE y2 = _SIMD_FMA(a1_2, h_2_1, a2_2); +#else + register __SIMD_DATATYPE t2 = _SIMD_ADD(a6_2, _SIMD_MUL(a5_2, h_6_5)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a4_2, h_6_4)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a3_2, h_6_3)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a2_2, h_6_2)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(a1_2, h_6_1)); + register __SIMD_DATATYPE v2 = _SIMD_ADD(a5_2, _SIMD_MUL(a4_2, h_5_4)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a3_2, h_5_3)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a2_2, h_5_2)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(a1_2, h_5_1)); + register __SIMD_DATATYPE w2 = _SIMD_ADD(a4_2, _SIMD_MUL(a3_2, h_4_3)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a2_2, h_4_2)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(a1_2, h_4_1)); + register __SIMD_DATATYPE z2 = _SIMD_ADD(a3_2, _SIMD_MUL(a2_2, h_3_2)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(a1_2, h_3_1)); + register __SIMD_DATATYPE y2 = _SIMD_ADD(a2_2, _SIMD_MUL(a1_2, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x2 = a1_2; + + __SIMD_DATATYPE q1; + __SIMD_DATATYPE q2; 
+ + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + for(i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); + x2 = _SIMD_FMA(q2, h1, x2); + y2 = _SIMD_FMA(q2, h2, y2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); 
+#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); + t2 = _SIMD_FMA(q2, h6, t2); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); + t2 = _SIMD_ADD(t2, _SIMD_MUL(q2,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET 
== AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#ifdef BLOCK4 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 
= _SIMD_FMA(q2, h2, y2); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-4)], hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = _SIMD_ADD(w2, _SIMD_MUL(q2,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); + v2 = _SIMD_FMA(q2, h5, v2); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); + v2 = _SIMD_ADD(v2, _SIMD_MUL(q2,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif +#if 
VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-4)], hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-5)], hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); + w2 = _SIMD_FMA(q2, h4, w2); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); + w2 = 
_SIMD_ADD(w2, _SIMD_MUL(q2,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-4)], hh[ldh+nb-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-5)], hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); + z2 = _SIMD_FMA(q2, h3, z2); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); + z2 = _SIMD_ADD(z2, _SIMD_MUL(q2,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = 
_SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); + y2 = _SIMD_FMA(q2, h2, y2); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); + y2 = _SIMD_ADD(y2, _SIMD_MUL(q2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); + x2 = _SIMD_FMA(q2, h1, x2); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + x2 = _SIMD_ADD(x2, _SIMD_MUL(q2,h1)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Rank-2 update of Q [ ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + 
+#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = 
_SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + 
__SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif 
+#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + x2 = _SIMD_MUL(x2, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + h2 = _SIMD_MUL(h1, vs); +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMA(y2, h1, _SIMD_MUL(x2,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_ADD(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); + y2 = _SIMD_FMSUB(y2, h1, _SIMD_MUL(x2,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); + y2 = _SIMD_SUB(_SIMD_MUL(y2,h1), _SIMD_MUL(x2,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); + z2 = _SIMD_FMSUB(z2, h1, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); + z2 = 
_SIMD_SUB(_SIMD_MUL(z2,h1), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + w2 = _SIMD_FMSUB(w2, h1, _SIMD_FMA(z2, h4, _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + w2 = _SIMD_SUB(_SIMD_MUL(w2,h1), _SIMD_ADD(_SIMD_MUL(z2,h4), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); + v2 = _SIMD_FMSUB(v2, tau5, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); + v2 = _SIMD_SUB(_SIMD_MUL(v2,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2)))); +#endif /* __ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); + t2 = _SIMD_FMSUB(t2, tau6, _SIMD_FMA(v2, h6, _SIMD_ADD(_SIMD_FMA(w2, h5, _SIMD_MUL(z2,h4)), _SIMD_FMA(y2, h3, _SIMD_MUL(x2,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), 
_SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); + t2 = _SIMD_SUB(_SIMD_MUL(t2,tau6), _SIMD_ADD( _SIMD_MUL(v2,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w2,h5), _SIMD_MUL(z2,h4)), _SIMD_ADD(_SIMD_MUL(y2,h3), _SIMD_MUL(x2,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); +#endif + _STORE(&q[0],q1); + q2 = _LOAD(&q[offset]); +#ifdef BLOCK2 + q2 = _SIMD_ADD(q2, y2); +#endif +#ifdef BLOCK4 + q2 = _SIMD_SUB(q2, w2); +#endif +#ifdef BLOCK6 + q2 = _SIMD_SUB(q2, t2); +#endif + _STORE(&q[offset],q2); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _LOAD(&q[ldq]); + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + _STORE(&q[ldq],q1); + q2 = _LOAD(&q[ldq+offset]); + q2 = _SIMD_ADD(q2, _SIMD_FMA(y2, h2, x2)); + _STORE(&q[ldq+offset],q2); +#else + q1 = _LOAD(&q[ldq]); + q1 = _SIMD_ADD(q1, _SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + _STORE(&q[ldq],q1); + q2 = _LOAD(&q[ldq+offset]); + q2 = _SIMD_ADD(q2, _SIMD_ADD(x2, _SIMD_MUL(y2, h2))); + _STORE(&q[ldq+offset],q2); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = 
_LOAD(&q[ldq+offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); + q2 = _SIMD_SUB(q2, _SIMD_FMA(w2, h4, z2)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); + q2 = _SIMD_SUB(q2, _SIMD_ADD(z2, _SIMD_MUL(w2, h4))); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[ldq+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + +#ifdef __ELPA_USE_FMA__ + q1 = 
_SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3], q1); + _STORE(&q[(ldq*3)+offset], q2); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q2 = _LOAD(&q[(ldq+offset)]); + q1 = _SIMD_SUB(q1, v1); + q2 = _SIMD_SUB(q2, v2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + _STORE(&q[(ldq+offset)],q2); + +#if VEC_SET == SSE_128 || VEC_SET == 
AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + q1 = _LOAD(&q[ldq*2]); + q2 = _LOAD(&q[(ldq*2)+offset]); + q1 = _SIMD_SUB(q1, w1); + q2 = _SIMD_SUB(q2, w2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + _STORE(&q[(ldq*2)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q2 = _LOAD(&q[(ldq*3)+offset]); + + q1 = _SIMD_SUB(q1, z1); + q2 = _SIMD_SUB(q2, z2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif 
+#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif + + _STORE(&q[ldq*3],q1); + _STORE(&q[(ldq*3)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + q1 = _LOAD(&q[ldq*4]); + q2 = _LOAD(&q[(ldq*4)+offset]); + + q1 = _SIMD_SUB(q1, y1); + q2 = _SIMD_SUB(q2, y2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif + + _STORE(&q[ldq*4],q1); + _STORE(&q[(ldq*4)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q2 = _LOAD(&q[(ldq*5)+offset]); + q1 = _SIMD_SUB(q1, x1); + q2 = _SIMD_SUB(q2, x2); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, 
_SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif + _STORE(&q[ldq*5],q1); + _STORE(&q[(ldq*5)+offset],q2); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = 
_SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + q2 = _LOAD(&q[(i*ldq)+offset]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); + q2 = _SIMD_FMA(x2, h1, q2); + q2 = _SIMD_FMA(y2, h2, q2); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); + q2 = _SIMD_ADD(q2, _SIMD_ADD(_SIMD_MUL(x2,h1), _SIMD_MUL(y2, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, 
h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6*/ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); + q2 = _SIMD_NFMA(t2, h6, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(t2, h6)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[i*ldq],q1); + _STORE(&q[(i*ldq)+offset],q2); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + q2 = _LOAD(&q[(nb*ldq)+offset]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q2 = _SIMD_FMA(x2, h1, q2); +#else + q1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_ADD(q2, _SIMD_MUL(x2, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = 
_SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); 
+#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); + q2 = _SIMD_NFMA(v2, h5, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(v2, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + _STORE(&q[(nb*ldq)+offset],q2); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + q2 = _LOAD(&q[((nb+1)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, 
_SIMD_MUL(z2, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); + q2 = _SIMD_NFMA(w2, h4, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(w2, h4)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK6 */ + + _STORE(&q[(nb+1)*ldq],q1); + _STORE(&q[((nb+1)*ldq)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + q2 = _LOAD(&q[((nb+2)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = 
_SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); + q2 = _SIMD_NFMA(z2, h3, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(z2, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[(nb+2)*ldq],q1); + _STORE(&q[((nb+2)*ldq)+offset],q2); + +#endif /* BLOCK4 || BLOCK6*/ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + q2 = _LOAD(&q[((nb+3)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); + q2 = _SIMD_NFMA(y2, h2, q2); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(y2, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); + _STORE(&q[((nb+3)*ldq)+offset],q2); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + q2 = _LOAD(&q[((nb+4)*ldq)+offset]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); + q2 = _SIMD_NFMA(x2, h1, q2); +#else + q1 = _SIMD_SUB(q1, 
_SIMD_MUL(x1, h1)); + q2 = _SIMD_SUB(q2, _SIMD_MUL(x2, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + _STORE(&q[((nb+4)*ldq)+offset],q2); + +#endif /* BLOCK6 */ + +} + +#undef ROW_LENGTH +#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 +#ifdef DOUBLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 2 +#endif +#ifdef SINGLE_PRECISION_REAL +#undef ROW_LENGTH +#define ROW_LENGTH 4 +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */ + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 4 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL +#define ROW_LENGTH 8 +#endif +#ifdef SINGLE_PRECISION_REAL +#define ROW_LENGTH 16 +#endif +#endif /* VEC_SET == AVX_512 */ + + +/* + * Unrolled kernel that computes + * ROW_LENGTH rows of Q simultaneously, a + * matrix Vector product with two householder + */ +#ifdef BLOCK2 +/* + * vectors + a rank 2 update is performed + */ +#endif +#ifdef BLOCK4 +/* + * vectors + a rank 1 update is performed + */ +#endif + +__forceinline void CONCAT_8ARGS(hh_trafo_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq, int ldh, +#ifdef BLOCK2 + DATA_TYPE s) +#endif +#ifdef BLOCK4 + DATA_TYPE s_1_2, DATA_TYPE s_1_3, DATA_TYPE s_2_3, DATA_TYPE s_1_4, DATA_TYPE s_2_4, DATA_TYPE s_3_4) +#endif +#ifdef BLOCK6 + DATA_TYPE_PTR scalarprods) +#endif + { +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ ROW_LENGTH x nb+1] * hh + // hh contains two householder vectors, with offset 1 + ///////////////////////////////////////////////////// +#endif +#if defined(BLOCK4) || defined(BLOCK6) + ///////////////////////////////////////////////////// + // Matrix Vector Multiplication, Q [ 
ROW_LENGTH x nb+3] * hh + // hh contains four householder vectors + ///////////////////////////////////////////////////// +#endif + + int i; +#ifdef BLOCK2 +#if VEC_SET == SSE_128 + // Needed bit mask for floating point sign flip +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000)); +#endif +#endif /* VEC_SET == SSE_128 */ + +#if VEC_SET == VSX_SSE + __SIMD_DATATYPE sign = vec_splats(-1.0); +#endif + +#if VEC_SET == AVX_256 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi64x(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_256 */ + +#if VEC_SET == AVX_512 +#ifdef DOUBLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000); +#endif +#ifdef SINGLE_PRECISION_REAL + __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000); +#endif +#endif /* VEC_SET == AVX_512 */ + __SIMD_DATATYPE x1 = _LOAD(&q[ldq]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h1 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h1 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + + __SIMD_DATATYPE h2; +#ifdef __ELPA_USE_FMA__ + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_FMA(x1, h1, q1); +#else + __SIMD_DATATYPE q1 = _LOAD(q); + __SIMD_DATATYPE y1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); +#endif +#endif /* BLOCK2 */ + +#ifdef BLOCK4 + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[0]); + +#if 
VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); + register __SIMD_DATATYPE x1 = a1_1; +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); + register __SIMD_DATATYPE x1 = a1_1; +#endif /* 
__ELPA_USE_FMA__ */ + + __SIMD_DATATYPE q1; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + + __SIMD_DATATYPE a1_1 = _LOAD(&q[ldq*5]); + __SIMD_DATATYPE a2_1 = _LOAD(&q[ldq*4]); + __SIMD_DATATYPE a3_1 = _LOAD(&q[ldq*3]); + __SIMD_DATATYPE a4_1 = _LOAD(&q[ldq*2]); + __SIMD_DATATYPE a5_1 = _LOAD(&q[ldq]); + __SIMD_DATATYPE a6_1 = _LOAD(&q[0]); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_6_5 = _SIMD_SET1(hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET1(hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET1(hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET1(hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_6_5 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_6_5 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); + __SIMD_DATATYPE h_6_4 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); + __SIMD_DATATYPE h_6_3 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); + __SIMD_DATATYPE h_6_2 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); + __SIMD_DATATYPE h_6_1 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE t1 = _SIMD_FMA(a5_1, h_6_5, a6_1); + t1 = _SIMD_FMA(a4_1, h_6_4, t1); + t1 = _SIMD_FMA(a3_1, h_6_3, t1); + t1 = _SIMD_FMA(a2_1, h_6_2, t1); + t1 = _SIMD_FMA(a1_1, h_6_1, t1); +#else + register __SIMD_DATATYPE t1 = _SIMD_ADD(a6_1, _SIMD_MUL(a5_1, h_6_5)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a4_1, h_6_4)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a3_1, h_6_3)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a2_1, h_6_2)); + t1 = _SIMD_ADD(t1, _SIMD_MUL(a1_1, h_6_1)); 
+#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_5_4 = _SIMD_SET1(hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET1(hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET1(hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_5_4 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_5_4 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); + __SIMD_DATATYPE h_5_3 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); + __SIMD_DATATYPE h_5_2 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); + __SIMD_DATATYPE h_5_1 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE v1 = _SIMD_FMA(a4_1, h_5_4, a5_1); + v1 = _SIMD_FMA(a3_1, h_5_3, v1); + v1 = _SIMD_FMA(a2_1, h_5_2, v1); + v1 = _SIMD_FMA(a1_1, h_5_1, v1); +#else + register __SIMD_DATATYPE v1 = _SIMD_ADD(a5_1, _SIMD_MUL(a4_1, h_5_4)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a3_1, h_5_3)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a2_1, h_5_2)); + v1 = _SIMD_ADD(v1, _SIMD_MUL(a1_1, h_5_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_4_3 = _SIMD_SET1(hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET1(hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_4_3 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_4_3 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); + __SIMD_DATATYPE h_4_2 = 
_SIMD_BROADCAST(&hh[(ldh*3)+2]); + __SIMD_DATATYPE h_4_1 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE w1 = _SIMD_FMA(a3_1, h_4_3, a4_1); + w1 = _SIMD_FMA(a2_1, h_4_2, w1); + w1 = _SIMD_FMA(a1_1, h_4_1, w1); +#else + register __SIMD_DATATYPE w1 = _SIMD_ADD(a4_1, _SIMD_MUL(a3_1, h_4_3)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a2_1, h_4_2)); + w1 = _SIMD_ADD(w1, _SIMD_MUL(a1_1, h_4_1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE h_2_1 = _SIMD_SET1(hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET1(hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE h_2_1 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE h_2_1 = _SIMD_BROADCAST(&hh[ldh+1]); + __SIMD_DATATYPE h_3_2 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); + __SIMD_DATATYPE h_3_1 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + register __SIMD_DATATYPE z1 = _SIMD_FMA(a2_1, h_3_2, a3_1); + z1 = _SIMD_FMA(a1_1, h_3_1, z1); + register __SIMD_DATATYPE y1 = _SIMD_FMA(a1_1, h_2_1, a2_1); +#else + register __SIMD_DATATYPE z1 = _SIMD_ADD(a3_1, _SIMD_MUL(a2_1, h_3_2)); + z1 = _SIMD_ADD(z1, _SIMD_MUL(a1_1, h_3_1)); + register __SIMD_DATATYPE y1 = _SIMD_ADD(a2_1, _SIMD_MUL(a1_1, h_2_1)); +#endif /* __ELPA_USE_FMA__ */ + + register __SIMD_DATATYPE x1 = a1_1; + + __SIMD_DATATYPE q1; + + __SIMD_DATATYPE h1; + __SIMD_DATATYPE h2; + __SIMD_DATATYPE h3; + __SIMD_DATATYPE h4; + __SIMD_DATATYPE h5; + __SIMD_DATATYPE h6; + +#endif /* BLOCK6 */ + + for(i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = 
_SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif /* VEC_SET == AVX_256 */ + +#ifdef __ELPA_USE_FMA__ + q1 = _LOAD(&q[i*ldq]); + x1 = _SIMD_FMA(q1, h1, x1); + y1 = _SIMD_FMA(q1, h2, y1); +#else + q1 = _LOAD(&q[i*ldq]); + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); +#endif +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-(BLOCK-5)], hh[(ldh*4)+i-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, 
v1); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i]); +#endif + +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]); +#endif + +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMA(q1, h6, t1); +#else + t1 = _SIMD_ADD(t1, _SIMD_MUL(q1,h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _LOAD(&q[nb*ldq]); + x1 = _SIMD_FMA(q1, h1, x1); +#else + q1 = _LOAD(&q[nb*ldq]); + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); +#endif + 
+#ifdef BLOCK4 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh*1)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK4 */ +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMA(q1, h4, w1); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-1]); +#endif + +#if 
VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMA(q1, h5, v1); +#else + v1 = _SIMD_ADD(v1, _SIMD_MUL(q1,h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-4]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-4], hh[nb-4]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-3], hh[ldh+nb-3]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + w1 = 
_SIMD_FMA(q1, h4, w1); +#else + w1 = _SIMD_ADD(w1, _SIMD_MUL(q1,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-3]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-3], hh[nb-3]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-3]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMA(q1, h3, z1); +#else + z1 = _SIMD_ADD(z1, _SIMD_MUL(q1,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-4)]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(q1, h2, y1); +#else + y1 = _SIMD_ADD(y1, _SIMD_MUL(q1,h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-5)]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + +#ifdef __ELPA_USE_FMA__ + x1 = _SIMD_FMA(q1, h1, x1); +#else + x1 = _SIMD_ADD(x1, _SIMD_MUL(q1,h1)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + +#ifdef BLOCK2 + ///////////////////////////////////////////////////// + // Rank-2 update of Q [ ROW_LENGTH x nb+1] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK4 + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif +#ifdef BLOCK6 + ///////////////////////////////////////////////////// + // Apply tau, correct wrong calculation using pre-calculated scalar products + ///////////////////////////////////////////////////// +#endif + + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + __SIMD_DATATYPE tau1 = _SIMD_SET1(hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET1(hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET1(hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET1(hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET1(hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET1(hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET1(s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(s_1_2); + 
__SIMD_DATATYPE vs_1_3 = _SIMD_SET1(s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET1(scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET1(scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET1(scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET1(scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET1(scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET1(scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET1(scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET1(scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET1(scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET1(scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET1(scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET1(scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET1(scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET1(scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET1(scalarprods[14]); +#endif +#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE */ + +#if VEC_SET == SPARC64_SSE + __SIMD_DATATYPE tau1 = _SIMD_SET(hh[0], hh[0]); + __SIMD_DATATYPE tau2 = _SIMD_SET(hh[ldh], hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_SET(hh[ldh*2], hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_SET(hh[ldh*3], hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_SET(hh[ldh*4], hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_SET(hh[ldh*5], hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_SET(s, s); +#endif +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(s_1_2, s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(s_1_3, s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(s_2_3, s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(s_1_4, s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(s_2_4, s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(s_3_4, s_3_4); + 
+#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = _SIMD_SET(scalarprods[0], scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_SET(scalarprods[1], scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_SET(scalarprods[2], scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_SET(scalarprods[3], scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_SET(scalarprods[4], scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_SET(scalarprods[5], scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_SET(scalarprods[6], scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_SET(scalarprods[7], scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_SET(scalarprods[8], scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_SET(scalarprods[9], scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_SET(scalarprods[10], scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_SET(scalarprods[11], scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_SET(scalarprods[12], scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_SET(scalarprods[13], scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_SET(scalarprods[14], scalarprods[14]); +#endif +#endif /* VEC_SET == SPARC64_SSE */ + +#if VEC_SET == AVX_256 + __SIMD_DATATYPE tau1 = _SIMD_BROADCAST(hh); + __SIMD_DATATYPE tau2 = _SIMD_BROADCAST(&hh[ldh]); +#if defined(BLOCK4) || defined(BLOCK6) + __SIMD_DATATYPE tau3 = _SIMD_BROADCAST(&hh[ldh*2]); + __SIMD_DATATYPE tau4 = _SIMD_BROADCAST(&hh[ldh*3]); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE tau5 = _SIMD_BROADCAST(&hh[ldh*4]); + __SIMD_DATATYPE tau6 = _SIMD_BROADCAST(&hh[ldh*5]); +#endif + +#ifdef BLOCK2 + __SIMD_DATATYPE vs = _SIMD_BROADCAST(&s); +#endif + +#ifdef BLOCK4 + __SIMD_DATATYPE vs_1_2 = _SIMD_BROADCAST(&s_1_2); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&s_1_3); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&s_2_3); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&s_1_4); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&s_2_4); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&s_3_4); +#endif +#ifdef BLOCK6 + __SIMD_DATATYPE vs_1_2 = 
_SIMD_BROADCAST(&scalarprods[0]); + __SIMD_DATATYPE vs_1_3 = _SIMD_BROADCAST(&scalarprods[1]); + __SIMD_DATATYPE vs_2_3 = _SIMD_BROADCAST(&scalarprods[2]); + __SIMD_DATATYPE vs_1_4 = _SIMD_BROADCAST(&scalarprods[3]); + __SIMD_DATATYPE vs_2_4 = _SIMD_BROADCAST(&scalarprods[4]); + __SIMD_DATATYPE vs_3_4 = _SIMD_BROADCAST(&scalarprods[5]); + __SIMD_DATATYPE vs_1_5 = _SIMD_BROADCAST(&scalarprods[6]); + __SIMD_DATATYPE vs_2_5 = _SIMD_BROADCAST(&scalarprods[7]); + __SIMD_DATATYPE vs_3_5 = _SIMD_BROADCAST(&scalarprods[8]); + __SIMD_DATATYPE vs_4_5 = _SIMD_BROADCAST(&scalarprods[9]); + __SIMD_DATATYPE vs_1_6 = _SIMD_BROADCAST(&scalarprods[10]); + __SIMD_DATATYPE vs_2_6 = _SIMD_BROADCAST(&scalarprods[11]); + __SIMD_DATATYPE vs_3_6 = _SIMD_BROADCAST(&scalarprods[12]); + __SIMD_DATATYPE vs_4_6 = _SIMD_BROADCAST(&scalarprods[13]); + __SIMD_DATATYPE vs_5_6 = _SIMD_BROADCAST(&scalarprods[14]); +#endif +#endif /* VEC_SET == AVX_256 */ + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau1, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau1); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi64((__AVX512i) tau1, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau1, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau1, sign); +#endif +#endif /* VEC_SET == AVX_512 */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau1; +#endif + + x1 = _SIMD_MUL(x1, h1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == VSX_SSE || VEC_SET == AVX_256 + h1 = _XOR(tau2, sign); +#endif + +#if VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_NEG(tau2); +#endif + +#if VEC_SET == AVX_512 +#ifdef HAVE_AVX512_XEON_PHI +#ifdef DOUBLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) 
_mm512_xor_epi64((__AVX512i) tau2, (__AVX512i) sign); +#endif +#ifdef SINGLE_PRECISION_REAL + h1 = (__AVX512_DATATYPE) _mm512_xor_epi32((__AVX512i) tau2, (__AVX512i) sign); +#endif +#endif /* HAVE_AVX512_XEON_PHI */ + +#ifdef HAVE_AVX512_XEON + h1 = _XOR(tau2, sign); +#endif +#endif /* VEC_SET == AVX_512 */ + h2 = _SIMD_MUL(h1, vs); +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + h1 = tau2; + h2 = _SIMD_MUL(h1, vs_1_2); +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMA(y1, h1, _SIMD_MUL(x1,h2)); +#else + y1 = _SIMD_ADD(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + y1 = _SIMD_FMSUB(y1, h1, _SIMD_MUL(x1,h2)); +#else + y1 = _SIMD_SUB(_SIMD_MUL(y1,h1), _SIMD_MUL(x1,h2)); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau3; + h2 = _SIMD_MUL(h1, vs_1_3); + h3 = _SIMD_MUL(h1, vs_2_3); + +#ifdef __ELPA_USE_FMA__ + z1 = _SIMD_FMSUB(z1, h1, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))); +#else + z1 = _SIMD_SUB(_SIMD_MUL(z1,h1), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))); +#endif /* __ELPA_USE_FMA__ */ + + h1 = tau4; + h2 = _SIMD_MUL(h1, vs_1_4); + h3 = _SIMD_MUL(h1, vs_2_4); + h4 = _SIMD_MUL(h1, vs_3_4); + +#ifdef __ELPA_USE_FMA__ + w1 = _SIMD_FMSUB(w1, h1, _SIMD_FMA(z1, h4, _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); +#else + w1 = _SIMD_SUB(_SIMD_MUL(w1,h1), _SIMD_ADD(_SIMD_MUL(z1,h4), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 + h2 = _SIMD_MUL(tau5, vs_1_5); + h3 = _SIMD_MUL(tau5, vs_2_5); + + h4 = _SIMD_MUL(tau5, vs_3_5); + h5 = _SIMD_MUL(tau5, vs_4_5); + + +#ifdef __ELPA_USE_FMA__ + v1 = _SIMD_FMSUB(v1, tau5, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2)))); +#else + v1 = _SIMD_SUB(_SIMD_MUL(v1,tau5), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2)))); +#endif /* 
__ELPA_USE_FMA__ */ + + h2 = _SIMD_MUL(tau6, vs_1_6); + h3 = _SIMD_MUL(tau6, vs_2_6); + h4 = _SIMD_MUL(tau6, vs_3_6); + h5 = _SIMD_MUL(tau6, vs_4_6); + h6 = _SIMD_MUL(tau6, vs_5_6); + +#ifdef __ELPA_USE_FMA__ + t1 = _SIMD_FMSUB(t1, tau6, _SIMD_FMA(v1, h6, _SIMD_ADD(_SIMD_FMA(w1, h5, _SIMD_MUL(z1,h4)), _SIMD_FMA(y1, h3, _SIMD_MUL(x1,h2))))); +#else + t1 = _SIMD_SUB(_SIMD_MUL(t1,tau6), _SIMD_ADD( _SIMD_MUL(v1,h6), _SIMD_ADD(_SIMD_ADD(_SIMD_MUL(w1,h5), _SIMD_MUL(z1,h4)), _SIMD_ADD(_SIMD_MUL(y1,h3), _SIMD_MUL(x1,h2))))); +#endif /* __ELPA_USE_FMA__ */ + + ///////////////////////////////////////////////////// + // Rank-1 update of Q [ROW_LENGTH x nb+3] + ///////////////////////////////////////////////////// +#endif /* BLOCK6 */ + + q1 = _LOAD(&q[0]); +#ifdef BLOCK2 + q1 = _SIMD_ADD(q1, y1); +#endif +#ifdef BLOCK4 + q1 = _SIMD_SUB(q1, w1); +#endif +#ifdef BLOCK6 + q1 = _SIMD_SUB(q1, t1); +#endif + _STORE(&q[0],q1); + +#ifdef BLOCK2 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _LOAD(&q[ldq]); + q1 = _SIMD_ADD(q1, _SIMD_FMA(y1, h2, x1)); + _STORE(&q[ldq],q1); +#else + q1 = _LOAD(&q[ldq]); + q1 = _SIMD_ADD(q1, _SIMD_ADD(x1, _SIMD_MUL(y1, h2))); + _STORE(&q[ldq],q1); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#ifdef BLOCK4 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_SUB(q1, _SIMD_FMA(w1, h4, z1)); +#else + q1 = _SIMD_SUB(q1, _SIMD_ADD(z1, _SIMD_MUL(w1, h4))); +#endif /* 
__ELPA_USE_FMA__ */ + + _STORE(&q[ldq],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + + q1 = _LOAD(&q[ldq*2]); + q1 = _SIMD_SUB(q1, y1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+3]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + + q1 = _LOAD(&q[ldq*3]); + q1 = _SIMD_SUB(q1, x1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+1]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+1], hh[ldh+1]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == 
NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*3], q1); + +#endif /* BLOCK4 */ + +#ifdef BLOCK6 + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+1]); +#endif + + q1 = _LOAD(&q[ldq]); + q1 = _SIMD_SUB(q1, v1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif + + _STORE(&q[ldq],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+1]); +#endif + q1 = _LOAD(&q[ldq*2]); + q1 = _SIMD_SUB(q1, w1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[ldq*2],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+1]); +#endif + 
+#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+1]); +#endif + q1 = _LOAD(&q[ldq*3]); + q1 = _SIMD_SUB(q1, z1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif + + _STORE(&q[ldq*3],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+1]); +#endif + + q1 = _LOAD(&q[ldq*4]); + q1 = _SIMD_SUB(q1, y1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+2]); +#endif + +#ifdef 
__ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif + + _STORE(&q[ldq*4],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[(ldh)+1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[(ldh)+1], hh[(ldh)+1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[(ldh)+1]); +#endif + q1 = _LOAD(&q[ldq*5]); + q1 = _SIMD_SUB(q1, x1); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+2]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = 
_SIMD_SET1(hh[(ldh*3)+3]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+3]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+4]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+4]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+5]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]); +#endif +#if VEC_SET == AVX_256 + h6 = _SIMD_BROADCAST(&hh[(ldh*5)+5]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif + + _STORE(&q[ldq*5],q1); + +#endif /* BLOCK6 */ + + for (i = BLOCK; i < nb; i++) + { +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[i-(BLOCK-1)]); + h2 = _SIMD_SET1(hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[i-(BLOCK-1)], hh[i-(BLOCK-1)]); + h2 = _SIMD_SET(hh[ldh+i-(BLOCK-2)], hh[ldh+i-(BLOCK-2)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[i-(BLOCK-1)]); + h2 = _SIMD_BROADCAST(&hh[ldh+i-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[i*ldq]); + +#ifdef BLOCK2 +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); + q1 = _SIMD_FMA(y1, h2, q1); +#else + q1 = _SIMD_ADD(q1, _SIMD_ADD(_SIMD_MUL(x1,h1), _SIMD_MUL(y1, h2))); +#endif /* __ELPA_USE_FMA__ */ +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || 
defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1,h1)); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1,h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+i-(BLOCK-3)], hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+i-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1,h3)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+i-(BLOCK-4)], hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+i-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1,h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK4 || BLOCK6*/ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+i-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+i-(BLOCK-5)], hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+i-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h6 = _SIMD_SET1(hh[(ldh*5)+i-(BLOCK-6)]); +#endif +#if VEC_SET == SPARC64_SSE + h6 = _SIMD_SET(hh[(ldh*5)+i], hh[(ldh*5)+i-(BLOCK-6)]); +#endif +#if VEC_SET == AVX_256 + h6 = 
_SIMD_BROADCAST(&hh[(ldh*5)+i-(BLOCK-6)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(t1, h6, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(t1, h6)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[i*ldq],q1); + + } +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-1)], hh[nb-(BLOCK-1)]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-1)]); +#endif + + q1 = _LOAD(&q[nb*ldq]); + +#ifdef BLOCK2 + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_FMA(x1, h1, q1); +#else + q1 = _SIMD_ADD(q1, _SIMD_MUL(x1, h1)); +#endif +#endif /* BLOCK2 */ + +#if defined(BLOCK4) || defined(BLOCK6) + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-2)], hh[ldh+nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-2)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-(BLOCK-3)], hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif + +#endif /* BLOCK4 || BLOCK6 */ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-4)]); +#endif +#if 
VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-4)], hh[(ldh*3)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h5 = _SIMD_SET1(hh[(ldh*4)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h5 = _SIMD_SET(hh[(ldh*4)+nb-(BLOCK-5)], hh[(ldh*4)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h5 = _SIMD_BROADCAST(&hh[(ldh*4)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(v1, h5, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(v1, h5)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[nb*ldq],q1); + +#if defined(BLOCK4) || defined(BLOCK6) + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-2)], hh[nb-(BLOCK-2)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-2)]); +#endif + + q1 = _LOAD(&q[(nb+1)*ldq]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-(BLOCK-3)], hh[ldh+nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-(BLOCK-3)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = 
_SIMD_SET(hh[(ldh*2)+nb-(BLOCK-4)], hh[(ldh*2)+nb-(BLOCK-4)]); +#endif +#if VEC_SET == AVX_256 + h3 = _SIMD_BROADCAST(&hh[(ldh*2)+nb-(BLOCK-4)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h4 = _SIMD_SET1(hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == SPARC64_SSE + h4 = _SIMD_SET(hh[(ldh*3)+nb-(BLOCK-5)], hh[(ldh*3)+nb-(BLOCK-5)]); +#endif +#if VEC_SET == AVX_256 + h4 = _SIMD_BROADCAST(&hh[(ldh*3)+nb-(BLOCK-5)]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(w1, h4, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(w1, h4)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + _STORE(&q[(nb+1)*ldq],q1); + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-(BLOCK-3)], hh[nb-(BLOCK-3)]); +#endif + +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-(BLOCK-3)]); +#endif + + q1 = _LOAD(&q[(nb+2)*ldq]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); +#endif + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-2], hh[ldh+nb-2]); +#endif + +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-2]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h3 = _SIMD_SET1(hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h3 = _SIMD_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]); +#endif +#if VEC_SET == AVX_256 + h3 = 
_SIMD_BROADCAST(&hh[(ldh*2)+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(z1, h3, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(z1, h3)); +#endif /* __ELPA_USE_FMA__ */ + +#endif /* BLOCK6 */ + + _STORE(&q[(nb+2)*ldq],q1); + +#endif /* BLOCK4 || BLOCK6*/ + +#ifdef BLOCK6 +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-2]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-2], hh[nb-2]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-2]); +#endif + + q1 = _LOAD(&q[(nb+3)*ldq]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); +#endif /* __ELPA_USE_FMA__ */ + +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h2 = _SIMD_SET1(hh[ldh+nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h2 = _SIMD_SET(hh[ldh+nb-1], hh[ldh+nb-1]); +#endif +#if VEC_SET == AVX_256 + h2 = _SIMD_BROADCAST(&hh[ldh+nb-1]); +#endif + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(y1, h2, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(y1, h2)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+3)*ldq],q1); +#if VEC_SET == SSE_128 || VEC_SET == AVX_512 || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 + h1 = _SIMD_SET1(hh[nb-1]); +#endif +#if VEC_SET == SPARC64_SSE + h1 = _SIMD_SET(hh[nb-1], hh[nb-1]); +#endif +#if VEC_SET == AVX_256 + h1 = _SIMD_BROADCAST(&hh[nb-1]); +#endif + + q1 = _LOAD(&q[(nb+4)*ldq]); + +#ifdef __ELPA_USE_FMA__ + q1 = _SIMD_NFMA(x1, h1, q1); +#else + q1 = _SIMD_SUB(q1, _SIMD_MUL(x1, h1)); +#endif /* __ELPA_USE_FMA__ */ + + _STORE(&q[(nb+4)*ldq],q1); + +#endif /* BLOCK6 */ + +} + +#undef SIMD_SET +#undef OFFSET diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ 
elpa-2019.11.001/src/elpa2/kernels/real_avx512_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx512_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx512_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_4hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_4hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_4hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx512_4hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef BLOCK4 +#undef VEC_SET +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx512_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define VEC_SET AVX_512 +#define BLOCK6 1 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK6 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx512_6hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx512_6hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx512_6hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx512_6hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET AVX_512 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK6 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef BLOCK6 +#undef VEC_SET +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET AVX_256 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef BLOCK6 +#undef VEC_SET +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_bgp.f90 elpa-2019.11.001/src/elpa2/kernels/real_bgp.f90 --- elpa-2016.05.001/src/elpa2/kernels/real_bgp.f90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_bgp.f90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,799 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran *** +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- +!module real_bgp_kernel + +! private +! public double_hh_trafo_bgp +!contains + subroutine double_hh_trafo_bgp(q, hh, nb, nq, ldq, ldh) + use precision + + implicit none + + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*) + + real(kind=rk8) :: s + integer(kind=ik) :: i + + ! Safety only: + + if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' + if(mod(c_loc(q),16) /= 0) STOP 'Q unaligned!' + + ! Calculate dot product of the two Householder vectors + + s = hh(2,2)*1 + do i=3,nb + s = s+hh(i,2)*hh(i-1,1) + enddo + + do i=1,nq-16,20 + call hh_trafo_kernel_10_bgp(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_10_bgp(q(i+10,1), hh, nb, ldq, ldh, s) + enddo + + ! i > nq-16 now, i.e. 
at most 16 rows remain + + if(nq-i+1 > 12) then + call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_8_bgp(q(i+8,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 8) then + call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_4_bgp(q(i+8,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 4) then + call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 0) then + call hh_trafo_kernel_4_bgp(q(i ,1), hh, nb, ldq, ldh, s) + endif + + end subroutine double_hh_trafo_bgp + + ! -------------------------------------------------------------------------------------------------- + ! The following kernels perform the Householder transformation on Q for 10/8/4 rows. + ! Please note that Q is declared complex*16 here. + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s) + + use precision + use elpa_mpi + implicit none + + + integer(kind=ik), intent(in) :: nb, ldq, ldh + complex(kind=ck8), intent(inout) :: q(ldq/2,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + + complex(kind=ck8) :: x1, x2, x3, x4, x5, y1, y2, y3, y4, y5, q1, q2, q3, q4, q5, p1, p2, p3, p4, p5 + real(kind=rk8) :: h1, h2 + integer(kind=ik) :: i + + ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b + ! real*8 x + ! loadfp(a) = a + ! fxcpmadd(a,b,x) = a + b*x + ! fxpmul(a,x) = a*x + ! fpadd(a,b) = a+b + ! 
+ call alignx(16,q) + + + x1 = loadfp(q(1,2)) + x2 = loadfp(q(2,2)) + x3 = loadfp(q(3,2)) + x4 = loadfp(q(4,2)) + x5 = loadfp(q(5,2)) + + h2 = hh(2,2) + y1 = loadfp(q(1,1)) + y2 = loadfp(q(2,1)) + y3 = loadfp(q(3,1)) + y4 = loadfp(q(4,1)) + y5 = loadfp(q(5,1)) + y1 = fxcpmadd(y1,x1,h2) + q1 = loadfp(q(1,3)) + y2 = fxcpmadd(y2,x2,h2) + q2 = loadfp(q(2,3)) + y3 = fxcpmadd(y3,x3,h2) + q3 = loadfp(q(3,3)) + y4 = fxcpmadd(y4,x4,h2) + q4 = loadfp(q(4,3)) + y5 = fxcpmadd(y5,x5,h2) + q5 = loadfp(q(5,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + x5 = fxcpmadd(x5,q5,h1) + + h1 = hh(i ,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+1)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+1)) + y3 = fxcpmadd(y3,q3,h2) + q3 = loadfp(q(3,i+1)) + y4 = fxcpmadd(y4,q4,h2) + q4 = loadfp(q(4,i+1)) + y5 = fxcpmadd(y5,q5,h2) + q5 = loadfp(q(5,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + x5 = fxcpmadd(x5,q5,h1) + + h1 = hh(i+1,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+2)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+2)) + y3 = fxcpmadd(y3,q3,h2) + q3 = loadfp(q(3,i+2)) + y4 = fxcpmadd(y4,q4,h2) + q4 = loadfp(q(4,i+2)) + y5 = fxcpmadd(y5,q5,h2) + q5 = loadfp(q(5,i+2)) + + enddo + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + x5 = fxcpmadd(x5,q5,h1) + + h1 = -hh(1,1) ! 
for below + h2 = -hh(1,2) + x1 = fxpmul(x1,h1) + x2 = fxpmul(x2,h1) + x3 = fxpmul(x3,h1) + x4 = fxpmul(x4,h1) + x5 = fxpmul(x5,h1) + h1 = -hh(1,2)*s + y1 = fxpmul(y1,h2) + y2 = fxpmul(y2,h2) + y3 = fxpmul(y3,h2) + y4 = fxpmul(y4,h2) + y5 = fxpmul(y5,h2) + y1 = fxcpmadd(y1,x1,h1) + q1 = loadfp(q(1,1)) + y2 = fxcpmadd(y2,x2,h1) + q2 = loadfp(q(2,1)) + y3 = fxcpmadd(y3,x3,h1) + q3 = loadfp(q(3,1)) + y4 = fxcpmadd(y4,x4,h1) + q4 = loadfp(q(4,1)) + y5 = fxcpmadd(y5,x5,h1) + q5 = loadfp(q(5,1)) + + q1 = fpadd(q1,y1) + p1 = loadfp(q(1,2)) + q2 = fpadd(q2,y2) + p2 = loadfp(q(2,2)) + q3 = fpadd(q3,y3) + p3 = loadfp(q(3,2)) + q4 = fpadd(q4,y4) + p4 = loadfp(q(4,2)) + q5 = fpadd(q5,y5) + p5 = loadfp(q(5,2)) + + h2 = hh(2,2) + + call storefp(q(1,1),q1) + p1 = fpadd(p1,x1) + call storefp(q(2,1),q2) + p2 = fpadd(p2,x2) + call storefp(q(3,1),q3) + p3 = fpadd(p3,x3) + call storefp(q(4,1),q4) + p4 = fpadd(p4,x4) + call storefp(q(5,1),q5) + p5 = fpadd(p5,x5) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,3)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,3)) + p3 = fxcpmadd(p3,y3,h2) + q3 = loadfp(q(3,3)) + p4 = fxcpmadd(p4,y4,h2) + q4 = loadfp(q(4,3)) + p5 = fxcpmadd(p5,y5,h2) + q5 = loadfp(q(5,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + call storefp(q(1,i-1),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,i-1),p2) + q2 = fxcpmadd(q2,x2,h1) + call storefp(q(3,i-1),p3) + q3 = fxcpmadd(q3,x3,h1) + call storefp(q(4,i-1),p4) + q4 = fxcpmadd(q4,x4,h1) + call storefp(q(5,i-1),p5) + q5 = fxcpmadd(q5,x5,h1) + + h1 = hh(i,1) + + q1 = fxcpmadd(q1,y1,h2) + p1 = loadfp(q(1,i+1)) + q2 = fxcpmadd(q2,y2,h2) + p2 = loadfp(q(2,i+1)) + q3 = fxcpmadd(q3,y3,h2) + p3 = loadfp(q(3,i+1)) + q4 = fxcpmadd(q4,y4,h2) + p4 = loadfp(q(4,i+1)) + q5 = fxcpmadd(q5,y5,h2) + p5 = loadfp(q(5,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + call storefp(q(1,i),q1) + p1 = fxcpmadd(p1,x1,h1) + call storefp(q(2,i),q2) + p2 = fxcpmadd(p2,x2,h1) + call storefp(q(3,i),q3) + p3 = fxcpmadd(p3,x3,h1) + call 
storefp(q(4,i),q4) + p4 = fxcpmadd(p4,x4,h1) + call storefp(q(5,i),q5) + p5 = fxcpmadd(p5,x5,h1) + + h1 = hh(i+1,1) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,i+2)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,i+2)) + p3 = fxcpmadd(p3,y3,h2) + q3 = loadfp(q(3,i+2)) + p4 = fxcpmadd(p4,y4,h2) + q4 = loadfp(q(4,i+2)) + p5 = fxcpmadd(p5,y5,h2) + q5 = loadfp(q(5,i+2)) + + enddo + + + if(i==nb) then + call storefp(q(1,nb),q1) + p1 = fxcpmadd(p1,x1,h1) + call storefp(q(2,nb),q2) + p2 = fxcpmadd(p2,x2,h1) + call storefp(q(3,nb),q3) + p3 = fxcpmadd(p3,x3,h1) + call storefp(q(4,nb),q4) + p4 = fxcpmadd(p4,x4,h1) + call storefp(q(5,nb),q5) + p5 = fxcpmadd(p5,x5,h1) + + call storefp(q(1,nb+1),p1) + call storefp(q(2,nb+1),p2) + call storefp(q(3,nb+1),p3) + call storefp(q(4,nb+1),p4) + call storefp(q(5,nb+1),p5) + else + call storefp(q(1,nb),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,nb),p2) + q2 = fxcpmadd(q2,x2,h1) + call storefp(q(3,nb),p3) + q3 = fxcpmadd(q3,x3,h1) + call storefp(q(4,nb),p4) + q4 = fxcpmadd(q4,x4,h1) + call storefp(q(5,nb),p5) + q5 = fxcpmadd(q5,x5,h1) + + call storefp(q(1,nb+1),q1) + call storefp(q(2,nb+1),q2) + call storefp(q(3,nb+1),q3) + call storefp(q(4,nb+1),q4) + call storefp(q(5,nb+1),q5) + endif + + + !contains + ! + ! subroutine storefp(a,b) + ! complex*16 a, b + ! + ! a = b + ! end subroutine + ! subroutine alignx(n, x) + ! integer n + ! complex*16 x(ldq/2,*) + ! end subroutine + + end subroutine hh_trafo_kernel_10_bgp + + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s) + + use precision + use elpa_mpi + implicit none + + + integer(kind=ik), intent(in) :: nb, ldq, ldh + complex(kind=ck8), intent(inout) :: q(ldq/2,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + + complex(kind=ck8) :: x1, x2, x3, x4, y1, y2, y3, y4, q1, q2, q3, q4, p1, p2, p3, p4 + real(kind=rk8) :: h1, h2 + integer(kind=ik) :: i + + ! 
complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b + ! real*8 x + ! loadfp(a) = a + ! fxcpmadd(a,b,x) = a + b*x + ! fxpmul(a,x) = a*x + ! fpadd(a,b) = a+b + + call alignx(16,q) + + + x1 = loadfp(q(1,2)) + x2 = loadfp(q(2,2)) + x3 = loadfp(q(3,2)) + x4 = loadfp(q(4,2)) + + h2 = hh(2,2) + y1 = loadfp(q(1,1)) + y2 = loadfp(q(2,1)) + y3 = loadfp(q(3,1)) + y4 = loadfp(q(4,1)) + y1 = fxcpmadd(y1,x1,h2) + q1 = loadfp(q(1,3)) + y2 = fxcpmadd(y2,x2,h2) + q2 = loadfp(q(2,3)) + y3 = fxcpmadd(y3,x3,h2) + q3 = loadfp(q(3,3)) + y4 = fxcpmadd(y4,x4,h2) + q4 = loadfp(q(4,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + + h1 = hh(i ,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+1)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+1)) + y3 = fxcpmadd(y3,q3,h2) + q3 = loadfp(q(3,i+1)) + y4 = fxcpmadd(y4,q4,h2) + q4 = loadfp(q(4,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + + h1 = hh(i+1,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+2)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+2)) + y3 = fxcpmadd(y3,q3,h2) + q3 = loadfp(q(3,i+2)) + y4 = fxcpmadd(y4,q4,h2) + q4 = loadfp(q(4,i+2)) + + enddo + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + x3 = fxcpmadd(x3,q3,h1) + x4 = fxcpmadd(x4,q4,h1) + + h1 = -hh(1,1) ! 
for below + h2 = -hh(1,2) + x1 = fxpmul(x1,h1) + x2 = fxpmul(x2,h1) + x3 = fxpmul(x3,h1) + x4 = fxpmul(x4,h1) + h1 = -hh(1,2)*s + y1 = fxpmul(y1,h2) + y2 = fxpmul(y2,h2) + y3 = fxpmul(y3,h2) + y4 = fxpmul(y4,h2) + y1 = fxcpmadd(y1,x1,h1) + q1 = loadfp(q(1,1)) + y2 = fxcpmadd(y2,x2,h1) + q2 = loadfp(q(2,1)) + y3 = fxcpmadd(y3,x3,h1) + q3 = loadfp(q(3,1)) + y4 = fxcpmadd(y4,x4,h1) + q4 = loadfp(q(4,1)) + + q1 = fpadd(q1,y1) + p1 = loadfp(q(1,2)) + q2 = fpadd(q2,y2) + p2 = loadfp(q(2,2)) + q3 = fpadd(q3,y3) + p3 = loadfp(q(3,2)) + q4 = fpadd(q4,y4) + p4 = loadfp(q(4,2)) + + h2 = hh(2,2) + + call storefp(q(1,1),q1) + p1 = fpadd(p1,x1) + call storefp(q(2,1),q2) + p2 = fpadd(p2,x2) + call storefp(q(3,1),q3) + p3 = fpadd(p3,x3) + call storefp(q(4,1),q4) + p4 = fpadd(p4,x4) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,3)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,3)) + p3 = fxcpmadd(p3,y3,h2) + q3 = loadfp(q(3,3)) + p4 = fxcpmadd(p4,y4,h2) + q4 = loadfp(q(4,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + call storefp(q(1,i-1),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,i-1),p2) + q2 = fxcpmadd(q2,x2,h1) + call storefp(q(3,i-1),p3) + q3 = fxcpmadd(q3,x3,h1) + call storefp(q(4,i-1),p4) + q4 = fxcpmadd(q4,x4,h1) + + h1 = hh(i,1) + + q1 = fxcpmadd(q1,y1,h2) + p1 = loadfp(q(1,i+1)) + q2 = fxcpmadd(q2,y2,h2) + p2 = loadfp(q(2,i+1)) + q3 = fxcpmadd(q3,y3,h2) + p3 = loadfp(q(3,i+1)) + q4 = fxcpmadd(q4,y4,h2) + p4 = loadfp(q(4,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + call storefp(q(1,i),q1) + p1 = fxcpmadd(p1,x1,h1) + call storefp(q(2,i),q2) + p2 = fxcpmadd(p2,x2,h1) + call storefp(q(3,i),q3) + p3 = fxcpmadd(p3,x3,h1) + call storefp(q(4,i),q4) + p4 = fxcpmadd(p4,x4,h1) + + h1 = hh(i+1,1) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,i+2)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,i+2)) + p3 = fxcpmadd(p3,y3,h2) + q3 = loadfp(q(3,i+2)) + p4 = fxcpmadd(p4,y4,h2) + q4 = loadfp(q(4,i+2)) + + enddo + + + if(i==nb) then + call storefp(q(1,nb),q1) + p1 = 
fxcpmadd(p1,x1,h1) + call storefp(q(2,nb),q2) + p2 = fxcpmadd(p2,x2,h1) + call storefp(q(3,nb),q3) + p3 = fxcpmadd(p3,x3,h1) + call storefp(q(4,nb),q4) + p4 = fxcpmadd(p4,x4,h1) + + call storefp(q(1,nb+1),p1) + call storefp(q(2,nb+1),p2) + call storefp(q(3,nb+1),p3) + call storefp(q(4,nb+1),p4) + else + call storefp(q(1,nb),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,nb),p2) + q2 = fxcpmadd(q2,x2,h1) + call storefp(q(3,nb),p3) + q3 = fxcpmadd(q3,x3,h1) + call storefp(q(4,nb),p4) + q4 = fxcpmadd(q4,x4,h1) + + call storefp(q(1,nb+1),q1) + call storefp(q(2,nb+1),q2) + call storefp(q(3,nb+1),q3) + call storefp(q(4,nb+1),q4) + endif + + + !contains + ! + ! subroutine storefp(a,b) + ! complex*16 a, b + ! + ! a = b + ! end subroutine + ! subroutine alignx(n, x) + ! integer n + ! complex*16 x(ldq/2,*) + ! end subroutine + + end subroutine hh_trafo_kernel_8_bgp + + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s) + + use precision + use elpa_mpi + implicit none + + + integer(kind=ik), intent(in) :: nb, ldq, ldh + complex(kind=ck8), intent(inout) :: q(ldq/2,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + + complex(kind=ck8) :: x1, x2, y1, y2, q1, q2, p1, p2 + real(kind=rk8) :: h1, h2 + integer(kind=ik) :: i + + ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b + ! real*8 x + ! loadfp(a) = a + ! fxcpmadd(a,b,x) = a + b*x + ! fxpmul(a,x) = a*x + ! 
fpadd(a,b) = a+b + + call alignx(16,q) + + + x1 = loadfp(q(1,2)) + x2 = loadfp(q(2,2)) + + h2 = hh(2,2) + y1 = loadfp(q(1,1)) + y2 = loadfp(q(2,1)) + y1 = fxcpmadd(y1,x1,h2) + q1 = loadfp(q(1,3)) + y2 = fxcpmadd(y2,x2,h2) + q2 = loadfp(q(2,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + + h1 = hh(i ,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+1)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + + h1 = hh(i+1,1) + + y1 = fxcpmadd(y1,q1,h2) + q1 = loadfp(q(1,i+2)) + y2 = fxcpmadd(y2,q2,h2) + q2 = loadfp(q(2,i+2)) + + enddo + + x1 = fxcpmadd(x1,q1,h1) + x2 = fxcpmadd(x2,q2,h1) + + h1 = -hh(1,1) ! for below + h2 = -hh(1,2) + x1 = fxpmul(x1,h1) + x2 = fxpmul(x2,h1) + h1 = -hh(1,2)*s + y1 = fxpmul(y1,h2) + y2 = fxpmul(y2,h2) + y1 = fxcpmadd(y1,x1,h1) + q1 = loadfp(q(1,1)) + y2 = fxcpmadd(y2,x2,h1) + q2 = loadfp(q(2,1)) + + q1 = fpadd(q1,y1) + p1 = loadfp(q(1,2)) + q2 = fpadd(q2,y2) + p2 = loadfp(q(2,2)) + + h2 = hh(2,2) + + call storefp(q(1,1),q1) + p1 = fpadd(p1,x1) + call storefp(q(2,1),q2) + p2 = fpadd(p2,x2) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,3)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,3)) + + h1 = hh(3-1,1) + + do i=3,nb,2 + + h2 = hh(i,2) + + call storefp(q(1,i-1),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,i-1),p2) + q2 = fxcpmadd(q2,x2,h1) + + h1 = hh(i,1) + + q1 = fxcpmadd(q1,y1,h2) + p1 = loadfp(q(1,i+1)) + q2 = fxcpmadd(q2,y2,h2) + p2 = loadfp(q(2,i+1)) + + if(i==nb) exit + + h2 = hh(i+1,2) + + call storefp(q(1,i),q1) + p1 = fxcpmadd(p1,x1,h1) + call storefp(q(2,i),q2) + p2 = fxcpmadd(p2,x2,h1) + + h1 = hh(i+1,1) + + p1 = fxcpmadd(p1,y1,h2) + q1 = loadfp(q(1,i+2)) + p2 = fxcpmadd(p2,y2,h2) + q2 = loadfp(q(2,i+2)) + + enddo + + + if(i==nb) then + call storefp(q(1,nb),q1) + p1 = fxcpmadd(p1,x1,h1) + call storefp(q(2,nb),q2) + p2 = fxcpmadd(p2,x2,h1) + + call storefp(q(1,nb+1),p1) 
+ call storefp(q(2,nb+1),p2) + else + call storefp(q(1,nb),p1) + q1 = fxcpmadd(q1,x1,h1) + call storefp(q(2,nb),p2) + q2 = fxcpmadd(q2,x2,h1) + + call storefp(q(1,nb+1),q1) + call storefp(q(2,nb+1),q2) + endif + + + !contains + ! + ! subroutine storefp(a,b) + ! complex*16 a, b + ! + ! a = b + ! end subroutine + ! subroutine alignx(n, x) + ! integer n + ! complex*16 x(ldq/2,*) + ! end subroutine + + end subroutine hh_trafo_kernel_4_bgp +!end module real_bgp_kernel +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_bgq.f90 elpa-2019.11.001/src/elpa2/kernels/real_bgq.f90 --- elpa-2016.05.001/src/elpa2/kernels/real_bgq.f90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_bgq.f90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,658 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! *** Special IBM BlueGene/Q version with QPX intrinsics in Fortran *** +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- +module real_bgq_kernel + + private + public double_hh_trafo_bgq +contains + subroutine double_hh_trafo_bgq(q, hh, nb, nq, ldq, ldh) + use precision + implicit none + + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*) + + real(kind=rk8) :: s + integer(kind=ik) :: i + + ! Safety only: + + if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' + + call alignx(32,q) + + ! 
Calculate dot product of the two Householder vectors + + s = hh(2,2)*1 + do i=3,nb + s = s+hh(i,2)*hh(i-1,1) + enddo + + do i=1,nq-20,24 + call hh_trafo_kernel_24_bgq(q(i ,1), hh, nb, ldq, ldh, s) + enddo + + if(nq-i+1 > 16) then + call hh_trafo_kernel_16_bgq(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_4_bgq(q(i+16,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 12) then + call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_8_bgq(q(i+8,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 8) then + call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) + call hh_trafo_kernel_4_bgq(q(i+8,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 4) then + call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) + else if(nq-i+1 > 0) then + call hh_trafo_kernel_4_bgq(q(i ,1), hh, nb, ldq, ldh, s) + endif + + end subroutine double_hh_trafo_bgq + + + ! -------------------------------------------------------------------------------------------------- + ! The following kernels perform the Householder transformation on Q for 24/16/8/4 rows. + ! 
-------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_24_bgq(q, hh, nb, ldq, ldh, s) + use precision + use elpa_mpi + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh + + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + + VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4, QPX_x5, QPX_x6 + VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4, QPX_y5, QPX_y6 + VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4, QPX_q5, QPX_q6 + VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s + integer i + + call alignx(32,q) + + !--- multiply Householder vectors with matrix q --- + + QPX_x1 = VEC_LD(0,q(1,2)) + QPX_x2 = VEC_LD(0,q(5,2)) + QPX_x3 = VEC_LD(0,q(9,2)) + QPX_x4 = VEC_LD(0,q(13,2)) + QPX_x5 = VEC_LD(0,q(17,2)) + QPX_x6 = VEC_LD(0,q(21,2)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_q3 = VEC_LD(0,q(9,1)) + QPX_q4 = VEC_LD(0,q(13,1)) + QPX_q5 = VEC_LD(0,q(17,1)) + QPX_q6 = VEC_LD(0,q(21,1)) + QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) + QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) + QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) + QPX_y5 = VEC_MADD(QPX_x5, QPX_h2, QPX_q5) + QPX_y6 = VEC_MADD(QPX_x6, QPX_h2, QPX_q6) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_q3 = VEC_LD(0,q(9,i)) + QPX_q4 = VEC_LD(0,q(13,i)) + QPX_q5 = VEC_LD(0,q(17,i)) + QPX_q6 = VEC_LD(0,q(21,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) + QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) + QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) + QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) + QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) + QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) + QPX_y4 = 
VEC_MADD(QPX_q4, QPX_h2, QPX_y4) + QPX_y5 = VEC_MADD(QPX_q5, QPX_h2, QPX_y5) + QPX_y6 = VEC_MADD(QPX_q6, QPX_h2, QPX_y6) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_q3 = VEC_LD(0,q(9,nb+1)) + QPX_q4 = VEC_LD(0,q(13,nb+1)) + QPX_q5 = VEC_LD(0,q(17,nb+1)) + QPX_q6 = VEC_LD(0,q(21,nb+1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) + QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) + QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) + QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) + + !--- multiply T matrix --- + + QPX_tau1 = VEC_SPLATS(-hh(1,1)) + QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) + QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) + QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) + QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) + QPX_x5 = VEC_MUL(QPX_x5, QPX_tau1) + QPX_x6 = VEC_MUL(QPX_x6, QPX_tau1) + QPX_tau2 = VEC_SPLATS(-hh(1,2)) + QPX_s = VEC_SPLATS(-hh(1,2)*s) + QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) + QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) + QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) + QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) + QPX_y5 = VEC_MUL(QPX_y5, QPX_tau2) + QPX_y6 = VEC_MUL(QPX_y6, QPX_tau2) + QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) + QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) + QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) + QPX_y5 = VEC_MADD(QPX_x5, QPX_s, QPX_y5) + QPX_y6 = VEC_MADD(QPX_x6, QPX_s, QPX_y6) + + !--- rank-2 update of q --- + + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_q3 = VEC_LD(0,q(9,1)) + QPX_q4 = VEC_LD(0,q(13,1)) + QPX_q5 = VEC_LD(0,q(17,1)) + QPX_q6 = VEC_LD(0,q(21,1)) + QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) + QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) + QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) + QPX_q5 = VEC_ADD(QPX_q5, QPX_y5) + QPX_q6 = VEC_ADD(QPX_q6, QPX_y6) + call VEC_ST(QPX_q1, 0, q(1,1)) + call VEC_ST(QPX_q2, 0, q(5,1)) + call VEC_ST(QPX_q3, 0, q(9,1)) + call VEC_ST(QPX_q4, 0, 
q(13,1)) + call VEC_ST(QPX_q5, 0, q(17,1)) + call VEC_ST(QPX_q6, 0, q(21,1)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,2)) + QPX_q2 = VEC_LD(0,q(5,2)) + QPX_q3 = VEC_LD(0,q(9,2)) + QPX_q4 = VEC_LD(0,q(13,2)) + QPX_q5 = VEC_LD(0,q(17,2)) + QPX_q6 = VEC_LD(0,q(21,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) + QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) + QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) + QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) + QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) + QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) + QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) + QPX_q5 = VEC_ADD(QPX_q5, QPX_x5) + QPX_q6 = VEC_ADD(QPX_q6, QPX_x6) + call VEC_ST(QPX_q1, 0, q(1,2)) + call VEC_ST(QPX_q2, 0, q(5,2)) + call VEC_ST(QPX_q3, 0, q(9,2)) + call VEC_ST(QPX_q4, 0, q(13,2)) + call VEC_ST(QPX_q5, 0, q(17,2)) + call VEC_ST(QPX_q6, 0, q(21,2)) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_q3 = VEC_LD(0,q(9,i)) + QPX_q4 = VEC_LD(0,q(13,i)) + QPX_q5 = VEC_LD(0,q(17,i)) + QPX_q6 = VEC_LD(0,q(21,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) + QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) + QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) + QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) + QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) + QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) + QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) + + call VEC_ST(QPX_q1, 0, q(1,i)) + call VEC_ST(QPX_q2, 0, q(5,i)) + call VEC_ST(QPX_q3, 0, q(9,i)) + call VEC_ST(QPX_q4, 0, q(13,i)) + call VEC_ST(QPX_q5, 0, q(17,i)) + call VEC_ST(QPX_q6, 0, q(21,i)) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = 
VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_q3 = VEC_LD(0,q(9,nb+1)) + QPX_q4 = VEC_LD(0,q(13,nb+1)) + QPX_q5 = VEC_LD(0,q(17,nb+1)) + QPX_q6 = VEC_LD(0,q(21,nb+1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) + QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) + QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) + QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) + call VEC_ST(QPX_q1, 0, q(1,nb+1)) + call VEC_ST(QPX_q2, 0, q(5,nb+1)) + call VEC_ST(QPX_q3, 0, q(9,nb+1)) + call VEC_ST(QPX_q4, 0, q(13,nb+1)) + call VEC_ST(QPX_q5, 0, q(17,nb+1)) + call VEC_ST(QPX_q6, 0, q(21,nb+1)) + + end subroutine hh_trafo_kernel_24_bgq + + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_16_bgq(q, hh, nb, ldq, ldh, s) + use precision + use elpa_mpi + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh + + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + + VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4 + VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4 + VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4 + VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s + integer i + + call alignx(32,q) + + !--- multiply Householder vectors with matrix q --- + + QPX_x1 = VEC_LD(0,q(1,2)) + QPX_x2 = VEC_LD(0,q(5,2)) + QPX_x3 = VEC_LD(0,q(9,2)) + QPX_x4 = VEC_LD(0,q(13,2)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_q3 = VEC_LD(0,q(9,1)) + QPX_q4 = VEC_LD(0,q(13,1)) + QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) + QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) + QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_q3 = VEC_LD(0,q(9,i)) + QPX_q4 = VEC_LD(0,q(13,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, 
QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) + QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) + QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) + QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) + QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_q3 = VEC_LD(0,q(9,nb+1)) + QPX_q4 = VEC_LD(0,q(13,nb+1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) + QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) + + !--- multiply T matrix --- + + QPX_tau1 = VEC_SPLATS(-hh(1,1)) + QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) + QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) + QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) + QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) + QPX_tau2 = VEC_SPLATS(-hh(1,2)) + QPX_s = VEC_SPLATS(-hh(1,2)*s) + QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) + QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) + QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) + QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) + QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) + QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) + QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) + + !--- rank-2 update of q --- + + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_q3 = VEC_LD(0,q(9,1)) + QPX_q4 = VEC_LD(0,q(13,1)) + QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) + QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) + QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) + call VEC_ST(QPX_q1, 0, q(1,1)) + call VEC_ST(QPX_q2, 0, q(5,1)) + call VEC_ST(QPX_q3, 0, q(9,1)) + call VEC_ST(QPX_q4, 0, q(13,1)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,2)) + QPX_q2 = VEC_LD(0,q(5,2)) + QPX_q3 = VEC_LD(0,q(9,2)) + QPX_q4 = VEC_LD(0,q(13,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) + 
QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) + QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) + QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) + QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) + call VEC_ST(QPX_q1, 0, q(1,2)) + call VEC_ST(QPX_q2, 0, q(5,2)) + call VEC_ST(QPX_q3, 0, q(9,2)) + call VEC_ST(QPX_q4, 0, q(13,2)) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_q3 = VEC_LD(0,q(9,i)) + QPX_q4 = VEC_LD(0,q(13,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) + QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) + QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) + + call VEC_ST(QPX_q1, 0, q(1,i)) + call VEC_ST(QPX_q2, 0, q(5,i)) + call VEC_ST(QPX_q3, 0, q(9,i)) + call VEC_ST(QPX_q4, 0, q(13,i)) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_q3 = VEC_LD(0,q(9,nb+1)) + QPX_q4 = VEC_LD(0,q(13,nb+1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) + QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) + call VEC_ST(QPX_q1, 0, q(1,nb+1)) + call VEC_ST(QPX_q2, 0, q(5,nb+1)) + call VEC_ST(QPX_q3, 0, q(9,nb+1)) + call VEC_ST(QPX_q4, 0, q(13,nb+1)) + + end subroutine hh_trafo_kernel_16_bgq + + ! 
-------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_8_bgq(q, hh, nb, ldq, ldh, s) + use precision + use elpa_mpi + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh + + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + integer(kind=ik) :: i + VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_y1, QPX_y2 + VECTOR(REAL(8))::QPX_q1, QPX_q2 + VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s + + + call alignx(32,q) + + !--- multiply Householder vectors with matrix q --- + + QPX_x1 = VEC_LD(0,q(1,2)) + QPX_x2 = VEC_LD(0,q(5,2)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) + QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) + + !--- multiply T matrix --- + + QPX_tau1 = VEC_SPLATS(-hh(1,1)) + QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) + QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) + QPX_tau2 = VEC_SPLATS(-hh(1,2)) + QPX_s = VEC_SPLATS(-hh(1,2)*s) + QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) + QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) + QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) + QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) + + !--- rank-2 update of q --- + + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q2 = VEC_LD(0,q(5,1)) + QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) + call VEC_ST(QPX_q1, 0, q(1,1)) + call VEC_ST(QPX_q2, 0, q(5,1)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,2)) + 
QPX_q2 = VEC_LD(0,q(5,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) + QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) + call VEC_ST(QPX_q1, 0, q(1,2)) + call VEC_ST(QPX_q2, 0, q(5,2)) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_q2 = VEC_LD(0,q(5,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) + + call VEC_ST(QPX_q1, 0, q(1,i)) + call VEC_ST(QPX_q2, 0, q(5,i)) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q2 = VEC_LD(0,q(5,nb+1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) + call VEC_ST(QPX_q1, 0, q(1,nb+1)) + call VEC_ST(QPX_q2, 0, q(5,nb+1)) + + end subroutine hh_trafo_kernel_8_bgq + + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_4_bgq(q, hh, nb, ldq, ldh, s) + use precision + use elpa_mpi + implicit none + + integer(kind=ik), intent(in) :: nb, ldq, ldh + + real(kind=rk8), intent(inout) :: q(ldq,*) + real(kind=rk8), intent(in) :: hh(ldh,*), s + integer(kind=ik) :: i + VECTOR(REAL(8))::QPX_x1, QPX_y1 + VECTOR(REAL(8))::QPX_q1 + VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s + + call alignx(32,q) + + !--- multiply Householder vectors with matrix q --- + + QPX_x1 = VEC_LD(0,q(1,2)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) + + !--- multiply T matrix --- + 
+ QPX_tau1 = VEC_SPLATS(-hh(1,1)) + QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) + QPX_tau2 = VEC_SPLATS(-hh(1,2)) + QPX_s = VEC_SPLATS(-hh(1,2)*s) + QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) + QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) + + !--- rank-2 update of q --- + + QPX_q1 = VEC_LD(0,q(1,1)) + QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) + call VEC_ST(QPX_q1, 0, q(1,1)) + + QPX_h2 = VEC_SPLATS(hh(2,2)) + QPX_q1 = VEC_LD(0,q(1,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) + call VEC_ST(QPX_q1, 0, q(1,2)) + + do i=3,nb,1 + + QPX_q1 = VEC_LD(0,q(1,i)) + QPX_h1 = VEC_SPLATS(hh(i-1,1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + QPX_h2 = VEC_SPLATS(hh(i,2)) + QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) + + call VEC_ST(QPX_q1, 0, q(1,i)) + + enddo + + QPX_h1 = VEC_SPLATS(hh(nb,1)) + QPX_q1 = VEC_LD(0,q(1,nb+1)) + QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) + call VEC_ST(QPX_q1, 0, q(1,nb+1)) + + end subroutine hh_trafo_kernel_4_bgq +end module real_bgq_kernel +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real.F90 elpa-2019.11.001/src/elpa2/kernels/real.F90 --- elpa-2016.05.001/src/elpa2/kernels/real.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,95 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! It should be compiled with the highest possible optimization level. +! +! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
-------------------------------------------------------------------------------------------------- +#include "config-f90.h" +#ifdef USE_ASSUMED_SIZE +#define PACK_REAL_TO_COMPLEX +#else +#undef PACK_REAL_TO_COMPLEX +#endif + +#ifndef USE_ASSUMED_SIZE +module real_generic_kernel + + private + public double_hh_trafo_real_generic_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public double_hh_trafo_real_generic_single +#endif + + contains +#endif + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "real_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "real_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +#ifndef USE_ASSUMED_SIZE +end module real_generic_kernel +#endif +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. 
Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET NEON_ARCH64_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef SINGLE__PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_simple_block4.F90 elpa-2019.11.001/src/elpa2/kernels/real_simple_block4.F90 --- elpa-2016.05.001/src/elpa2/kernels/real_simple_block4.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_simple_block4.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,94 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
-------------------------------------------------------------------------------------------------- +#endif +#include "config-f90.h" + +#ifndef USE_ASSUMED_SIZE +module real_generic_simple_block4_kernel + + private + public quad_hh_trafo_real_generic_simple_4hv_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public quad_hh_trafo_real_generic_simple_4hv_single +#endif + + contains +#endif + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_block4_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_block4_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +#ifndef USE_ASSUMED_SIZE +end module real_generic_simple_block4_kernel +#endif +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_simple_block6.F90 elpa-2019.11.001/src/elpa2/kernels/real_simple_block6.F90 --- elpa-2016.05.001/src/elpa2/kernels/real_simple_block6.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_simple_block6.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,95 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! 
+! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: A. Marek, MPCDF +! 
-------------------------------------------------------------------------------------------------- +#endif +#include "config-f90.h" + +!#ifndef USE_ASSUMED_SIZE +!module real_generic_simple_block6_kernel +! +! private +! public hexa_hh_trafo_real_generic_simple_6hv_double +! +!#ifdef WANT_SINGLE_PRECISION_REAL +! public hexa_hh_trafo_real_generic_simple_6hv_single +!#endif +! +! contains +!#endif + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_block6_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_block6_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +!#ifndef USE_ASSUMED_SIZE +!end module real_generic_simple_block6_kernel +!#endif +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_simple.F90 elpa-2019.11.001/src/elpa2/kernels/real_simple.F90 --- elpa-2016.05.001/src/elpa2/kernels/real_simple.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_simple.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,93 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! 
More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
-------------------------------------------------------------------------------------------------- + +#include "config-f90.h" + +#ifndef USE_ASSUMED_SIZE +module real_generic_simple_kernel + + private + public double_hh_trafo_real_generic_simple_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public double_hh_trafo_real_generic_simple_single +#endif + + contains +#endif + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "simple_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +#ifndef USE_ASSUMED_SIZE +end module real_generic_simple_kernel +#endif +! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sparc64_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sparc64_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sparc64_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sparc64_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET SPARC64_SSE +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK2 +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sparc64_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sparc64_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sparc64_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sparc64_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET SPARC64_SSE +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sparc64_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sparc64_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sparc64_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sparc64_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET SPARC64_SSE +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_4hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_4hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_4hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_4hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK4 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_sse_6hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_sse_6hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_sse_6hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_sse_6hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET SSE_128 +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef REALCASE +#undef VEC_SET +#undef BLOCK6 +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_template.F90 elpa-2019.11.001/src/elpa2/kernels/real_template.F90 --- elpa-2016.05.001/src/elpa2/kernels/real_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,708 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! It should be compiled with the highest possible optimization level. +! +! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- +#endif + +! the intel compiler creates a temp copy of array q! +! this should be prevented if possible without using assumed size arrays + subroutine double_hh_trafo_& + &MATH_DATATYPE& + &_generic_& + &PRECISION& + & (q, hh, nb, nq, ldq, ldh) + + use precision + use iso_c_binding + use elpa_abstract_impl + implicit none + + !class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#ifdef USE_ASSUMED_SIZE + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+1) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6) +#endif + + real(kind=C_DATATYPE_KIND) :: s + integer(kind=ik) :: i + +! equivalence(q(1,1),q_complex(1,1)) + + ! Safety only: + +! 
call obj%timer%start("kernel generic: double_hh_trafo_& +! &MATH_DATATYPE& +! &_generic" // & +! &PRECISION_SUFFIX & +! ) + + if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' + + ! Calculate dot product of the two Householder vectors + + s = hh(2,2)*1.0 + do i=3,nb + s = s+hh(i,2)*hh(i-1,1) + enddo + + ! Do the Householder transformations + +#ifndef USE_ASSUMED_SIZE +! ! assign real data to complex pointer +! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)]) +#endif + ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller + + do i=1,nq-8,12 + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_kernel_12_generic_& + &PRECISION& + & (q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_kernel_12_generic_& + &PRECISION& + & (q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + + enddo + + ! i > nq-8 now, i.e. at most 8 rows remain + + if(nq-i+1 > 4) then + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_kernel_8_generic_& + &PRECISION& + & (q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_kernel_8_generic_& + &PRECISION& + & (q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + + else if(nq-i+1 > 0) then + +#ifdef USE_ASSUMED_SIZE + call hh_trafo_kernel_4_generic_& + &PRECISION& + & (q(i,1),hh, nb, ldq, ldh, s) +#else + call hh_trafo_kernel_4_generic_& + &PRECISION& + & (q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) +#endif + + endif + +! call obj%timer%stop("kernel generic: double_hh_trafo_& +! &MATH_DATATYPE& +! &_generic" // & +! &PRECISION_SUFFIX & +! ) + + end subroutine + + ! -------------------------------------------------------------------------------------------------- + ! The following kernels perform the Householder transformation on Q for 12/8/4 rows. + ! Please note that Q is declared complex*16 here. + ! This is a hint for compilers that packed arithmetic can be used for Q + ! (relevant for Intel SSE and BlueGene double hummer CPUs). + ! 
-------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_12_generic_& + &PRECISION& + & (q, hh, nb, ldq, ldh, s) + use precision + implicit none + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: s + +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6 +#else + real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, & + y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 +#endif + real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + + + !call obj%timer%start("kernel generic: hh_trafo_kernel_12_generic" // & + ! &PRECISION_SUFFIX & + ! 
) + + x1 = q(1,2) + x2 = q(2,2) + x3 = q(3,2) + x4 = q(4,2) + x5 = q(5,2) + x6 = q(6,2) +#ifndef PACK_REAL_TO_COMPLEX + x7 = q(7,2) + x8 = q(8,2) + x9 = q(9,2) + x10 = q(10,2) + x11 = q(11,2) + x12 = q(12,2) +#endif + + y1 = q(1 ,1) + q(1, 2)*hh(2,2) + y2 = q(2 ,1) + q(2, 2)*hh(2,2) + y3 = q(3 ,1) + q(3, 2)*hh(2,2) + y4 = q(4 ,1) + q(4, 2)*hh(2,2) + y5 = q(5 ,1) + q(5, 2)*hh(2,2) + y6 = q(6 ,1) + q(6, 2)*hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + y7 = q(7 ,1) + q(7, 2)*hh(2,2) + y8 = q(8 ,1) + q(8, 2)*hh(2,2) + y9 = q(9 ,1) + q(9, 2)*hh(2,2) + y10 = q(10,1) + q(10,2)*hh(2,2) + y11 = q(11,1) + q(11,2)*hh(2,2) + y12 = q(12,1) + q(12,2)*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + x1 = x1 + q(1, i)*h1 + y1 = y1 + q(1, i)*h2 + x2 = x2 + q(2, i)*h1 + y2 = y2 + q(2, i)*h2 + x3 = x3 + q(3, i)*h1 + y3 = y3 + q(3, i)*h2 + x4 = x4 + q(4, i)*h1 + y4 = y4 + q(4, i)*h2 + x5 = x5 + q(5, i)*h1 + y5 = y5 + q(5, i)*h2 + x6 = x6 + q(6, i)*h1 + y6 = y6 + q(6, i)*h2 +#ifndef PACK_REAL_TO_COMPLEX + x7 = x7 + q(7, i)*h1 + y7 = y7 + q(7, i)*h2 + x8 = x8 + q(8, i)*h1 + y8 = y8 + q(8, i)*h2 + x9 = x9 + q(9, i)*h1 + y9 = y9 + q(9, i)*h2 + x10 = x10 + q(10,i)*h1 + y10 = y10 + q(10,i)*h2 + x11 = x11 + q(11,i)*h1 + y11 = y11 + q(11,i)*h2 + x12 = x12 + q(12,i)*h1 + y12 = y12 + q(12,i)*h2 +#endif + enddo + + x1 = x1 + q(1,nb+1)*hh(nb,1) + x2 = x2 + q(2,nb+1)*hh(nb,1) + x3 = x3 + q(3,nb+1)*hh(nb,1) + x4 = x4 + q(4,nb+1)*hh(nb,1) + x5 = x5 + q(5,nb+1)*hh(nb,1) + x6 = x6 + q(6,nb+1)*hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + x7 = x7 + q(7, nb+1)*hh(nb,1) + x8 = x8 + q(8, nb+1)*hh(nb,1) + x9 = x9 + q(9, nb+1)*hh(nb,1) + x10 = x10 + q(10,nb+1)*hh(nb,1) + x11 = x11 + q(11,nb+1)*hh(nb,1) + x12 = x12 + q(12,nb+1)*hh(nb,1) + +#endif + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1 *h1 + x2 = x2 *h1 + x3 = x3 *h1 + x4 = x4 *h1 + x5 = x5 *h1 + x6 = x6 *h1 +#ifndef PACK_REAL_TO_COMPLEX 
+ x7 = x7 *h1 + x8 = x8 *h1 + x9 = x9 *h1 + x10 = x10*h1 + x11 = x11*h1 + x12 = x12*h1 +#endif + + h1 = -tau2 + h2 = -tau2*s + y1 = y1 *h1 + x1 *h2 + y2 = y2 *h1 + x2 *h2 + y3 = y3 *h1 + x3 *h2 + y4 = y4 *h1 + x4 *h2 + y5 = y5 *h1 + x5 *h2 + y6 = y6 *h1 + x6 *h2 +#ifndef PACK_REAL_TO_COMPLEX + y7 = y7 *h1 + x7 *h2 + y8 = y8 *h1 + x8 *h2 + y9 = y9 *h1 + x9 *h2 + y10 = y10*h1 + x10*h2 + y11 = y11*h1 + x11*h2 + y12 = y12*h1 + x12*h2 +#endif + q(1,1) = q(1, 1) + y1 + q(2,1) = q(2, 1) + y2 + q(3,1) = q(3, 1) + y3 + q(4,1) = q(4, 1) + y4 + q(5,1) = q(5, 1) + y5 + q(6,1) = q(6, 1) + y6 +#ifndef PACK_REAL_TO_COMPLEX + q(7 ,1) = q(7, 1) + y7 + q(8 ,1) = q(8, 1) + y8 + q(9 ,1) = q(9, 1) + y9 + q(10,1) = q(10,1) + y10 + q(11,1) = q(11,1) + y11 + q(12,1) = q(12,1) + y12 +#endif + + q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2) + q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2) + q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2) + q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2) + q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2) + q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2) + q(8, 2) = q(8, 2) + x8 + y8 *hh(2,2) + q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2) + q(10,2) = q(10,2) + x10 + y10*hh(2,2) + q(11,2) = q(11,2) + x11 + y11*hh(2,2) + q(12,2) = q(12,2) + x12 + y12*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1, i) = q(1,i) + x1 *h1 + y1 *h2 + q(2, i) = q(2,i) + x2 *h1 + y2 *h2 + q(3, i) = q(3,i) + x3 *h1 + y3 *h2 + q(4, i) = q(4,i) + x4 *h1 + y4 *h2 + q(5, i) = q(5,i) + x5 *h1 + y5 *h2 + q(6, i) = q(6,i) + x6 *h1 + y6 *h2 +#ifndef PACK_REAL_TO_COMPLEX + q(7, i) = q(7, i) + x7 *h1 + y7 *h2 + q(8, i) = q(8, i) + x8 *h1 + y8 *h2 + q(9, i) = q(9, i) + x9 *h1 + y9 *h2 + q(10,i) = q(10,i) + x10*h1 + y10*h2 + q(11,i) = q(11,i) + x11*h1 + y11*h2 + q(12,i) = q(12,i) + x12*h1 + y12*h2 +#endif + enddo + + q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1) + q(2, nb+1) = q(2, nb+1) + 
x2 *hh(nb,1) + q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1) + q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1) + q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1) + q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1) + q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1) + q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1) + q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) + q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) + q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) +#endif + +! call obj%timer%stop("kernel generic: hh_trafo_kernel_12_generic" // & +! &PRECISION_SUFFIX & +! ) + + end subroutine + ! -------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_8_generic_& + &PRECISION& + & (q, hh, nb, ldq, ldh, s) + use precision + implicit none + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout):: q(:,:) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: s +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, x3, x4, y1, y2, y3, y4 +#else + real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, x5, x6, x7, x8, & + y1, y2, y3, y4, y5, y6, y7, y8 +#endif + real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + +! call obj%timer%start("kernel generic: hh_trafo_kernel_8_generic" // & +! &PRECISION_SUFFIX & +! 
) + x1 = q(1,2) + x2 = q(2,2) + x3 = q(3,2) + x4 = q(4,2) +#ifndef PACK_REAL_TO_COMPLEX + x5 = q(5,2) + x6 = q(6,2) + x7 = q(7,2) + x8 = q(8,2) +#endif + + y1 = q(1,1) + q(1,2)*hh(2,2) + y2 = q(2,1) + q(2,2)*hh(2,2) + y3 = q(3,1) + q(3,2)*hh(2,2) + y4 = q(4,1) + q(4,2)*hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + y5 = q(5,1) + q(5,2)*hh(2,2) + y6 = q(6,1) + q(6,2)*hh(2,2) + y7 = q(7,1) + q(7,2)*hh(2,2) + y8 = q(8,1) + q(8,2)*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + x1 = x1 + q(1,i)*h1 + y1 = y1 + q(1,i)*h2 + x2 = x2 + q(2,i)*h1 + y2 = y2 + q(2,i)*h2 + x3 = x3 + q(3,i)*h1 + y3 = y3 + q(3,i)*h2 + x4 = x4 + q(4,i)*h1 + y4 = y4 + q(4,i)*h2 +#ifndef PACK_REAL_TO_COMPLEX + x5 = x5 + q(5,i)*h1 + y5 = y5 + q(5,i)*h2 + x6 = x6 + q(6,i)*h1 + y6 = y6 + q(6,i)*h2 + x7 = x7 + q(7,i)*h1 + y7 = y7 + q(7,i)*h2 + x8 = x8 + q(8,i)*h1 + y8 = y8 + q(8,i)*h2 +#endif + enddo + + x1 = x1 + q(1,nb+1)*hh(nb,1) + x2 = x2 + q(2,nb+1)*hh(nb,1) + x3 = x3 + q(3,nb+1)*hh(nb,1) + x4 = x4 + q(4,nb+1)*hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + x5 = x5 + q(5,nb+1)*hh(nb,1) + x6 = x6 + q(6,nb+1)*hh(nb,1) + x7 = x7 + q(7,nb+1)*hh(nb,1) + x8 = x8 + q(8,nb+1)*hh(nb,1) +#endif + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 + x3 = x3*h1 + x4 = x4*h1 +#ifndef PACK_REAL_TO_COMPLEX + x5 = x5*h1 + x6 = x6*h1 + x7 = x7*h1 + x8 = x8*h1 +#endif + h1 = -tau2 + h2 = -tau2*s + y1 = y1*h1 + x1*h2 + y2 = y2*h1 + x2*h2 + y3 = y3*h1 + x3*h2 + y4 = y4*h1 + x4*h2 +#ifndef PACK_REAL_TO_COMPLEX + y5 = y5*h1 + x5*h2 + y6 = y6*h1 + x6*h2 + y7 = y7*h1 + x7*h2 + y8 = y8*h1 + x8*h2 +#endif + q(1,1) = q(1,1) + y1 + q(2,1) = q(2,1) + y2 + q(3,1) = q(3,1) + y3 + q(4,1) = q(4,1) + y4 +#ifndef PACK_REAL_TO_COMPLEX + q(5,1) = q(5,1) + y5 + q(6,1) = q(6,1) + y6 + q(7,1) = q(7,1) + y7 + q(8,1) = q(8,1) + y8 +#endif + q(1,2) = q(1,2) + x1 + y1*hh(2,2) + q(2,2) = q(2,2) + x2 + y2*hh(2,2) + q(3,2) 
= q(3,2) + x3 + y3*hh(2,2) + q(4,2) = q(4,2) + x4 + y4*hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + q(5,2) = q(5,2) + x5 + y5*hh(2,2) + q(6,2) = q(6,2) + x6 + y6*hh(2,2) + q(7,2) = q(7,2) + x7 + y7*hh(2,2) + q(8,2) = q(8,2) + x8 + y8*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1,i) = q(1,i) + x1*h1 + y1*h2 + q(2,i) = q(2,i) + x2*h1 + y2*h2 + q(3,i) = q(3,i) + x3*h1 + y3*h2 + q(4,i) = q(4,i) + x4*h1 + y4*h2 +#ifndef PACK_REAL_TO_COMPLEX + q(5,i) = q(5,i) + x5*h1 + y5*h2 + q(6,i) = q(6,i) + x6*h1 + y6*h2 + q(7,i) = q(7,i) + x7*h1 + y7*h2 + q(8,i) = q(8,i) + x8*h1 + y8*h2 +#endif + enddo + + q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) + q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) + q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) + q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) + q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) + q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) + q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) +#endif + + +! call obj%timer%stop("kernel generic: hh_trafo_kernel_8_generic" // & +! &PRECISION_SUFFIX & +! ) + + end subroutine + ! 
-------------------------------------------------------------------------------------------------- + + subroutine hh_trafo_kernel_4_generic_& + &PRECISION& + & (q, hh, nb, ldq, ldh, s) + + use precision + implicit none + integer(kind=ik), intent(in) :: nb, ldq, ldh +#ifdef USE_ASSUMED_SIZE +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE), intent(inout) :: q(ldq/2,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,2) +#endif + real(kind=C_DATATYPE_KIND), intent(in) :: s + +#ifdef PACK_REAL_TO_COMPLEX + complex(kind=SPECIAL_COMPLEX_DATATYPE) :: x1, x2, y1, y2 +#else + real(kind=C_DATATYPE_KIND) :: x1, x2, x3, x4, y1, y2, y3, y4 +#endif + real(kind=C_DATATYPE_KIND) :: h1, h2, tau1, tau2 + integer(kind=ik) :: i + +! call obj%timer%start("kernel generic: hh_trafo_kernel_4_generic" // & +! &PRECISION_SUFFIX & +! 
) + x1 = q(1,2) + x2 = q(2,2) +#ifndef PACK_REAL_TO_COMPLEX + x3 = q(3,2) + x4 = q(4,2) +#endif + + y1 = q(1,1) + q(1,2)*hh(2,2) + y2 = q(2,1) + q(2,2)*hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + y3 = q(3,1) + q(3,2)*hh(2,2) + y4 = q(4,1) + q(4,2)*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + x1 = x1 + q(1,i)*h1 + y1 = y1 + q(1,i)*h2 + x2 = x2 + q(2,i)*h1 + y2 = y2 + q(2,i)*h2 +#ifndef PACK_REAL_TO_COMPLEX + x3 = x3 + q(3,i)*h1 + y3 = y3 + q(3,i)*h2 + x4 = x4 + q(4,i)*h1 + y4 = y4 + q(4,i)*h2 +#endif + enddo + + x1 = x1 + q(1,nb+1)*hh(nb,1) + x2 = x2 + q(2,nb+1)*hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + x3 = x3 + q(3,nb+1)*hh(nb,1) + x4 = x4 + q(4,nb+1)*hh(nb,1) +#endif + + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x1 = x1*h1 + x2 = x2*h1 +#ifndef PACK_REAL_TO_COMPLEX + x3 = x3*h1 + x4 = x4*h1 +#endif + h1 = -tau2 + h2 = -tau2*s + y1 = y1*h1 + x1*h2 + y2 = y2*h1 + x2*h2 +#ifndef PACK_REAL_TO_COMPLEX + y3 = y3*h1 + x3*h2 + y4 = y4*h1 + x4*h2 +#endif + + q(1,1) = q(1,1) + y1 + q(2,1) = q(2,1) + y2 +#ifndef PACK_REAL_TO_COMPLEX + q(3,1) = q(3,1) + y3 + q(4,1) = q(4,1) + y4 +#endif + q(1,2) = q(1,2) + x1 + y1*hh(2,2) + q(2,2) = q(2,2) + x2 + y2*hh(2,2) +#ifndef PACK_REAL_TO_COMPLEX + q(3,2) = q(3,2) + x3 + y3*hh(2,2) + q(4,2) = q(4,2) + x4 + y4*hh(2,2) +#endif + +#ifdef DOUBLE_PRECISION_REAL +#if defined(SSE_ALIGNED) + !DEC$ VECTOR ALIGNED +#endif +#endif + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1,i) = q(1,i) + x1*h1 + y1*h2 + q(2,i) = q(2,i) + x2*h1 + y2*h2 +#ifndef PACK_REAL_TO_COMPLEX + q(3,i) = q(3,i) + x3*h1 + y3*h2 + q(4,i) = q(4,i) + x4*h1 + y4*h2 +#endif + enddo + + q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) + q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) +#ifndef PACK_REAL_TO_COMPLEX + q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) + q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) +#endif + +! call obj%timer%stop("kernel generic: hh_trafo_kernel_4_generic" // & +! 
&PRECISION_SUFFIX & +! ) + + end subroutine diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_2hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_2hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_2hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_2hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. 
If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_2hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_2hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_2hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_2hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK2 1 +#define VEC_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_128bit_256bit_512bit_BLOCK_template.c" +#undef BLOCK2 +#undef VEC_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_4hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_4hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_4hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_4hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK4 1 +#define SIMD_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_vsx_4hv_template.c" +#undef BLOCK4 +#undef SIMD_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_4hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_4hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_4hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_4hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK4 1 +#define SIMD_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_vsx_4hv_template.c" +#undef BLOCK4 +#undef SIMD_SET +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_6hv_double_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_6hv_double_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_6hv_double_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_6hv_double_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. 
Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define BLOCK6 1 +#define SIMD_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_vsx_6hv_template.c" +#undef BLOCK6 +#undef SIMD_SET +#undef REALCASE +#undef DOUBLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/real_vsx_6hv_single_precision.c elpa-2019.11.001/src/elpa2/kernels/real_vsx_6hv_single_precision.c --- elpa-2016.05.001/src/elpa2/kernels/real_vsx_6hv_single_precision.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/real_vsx_6hv_single_precision.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,59 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. 
+// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#define BLOCK6 1 +#define VEC_SET VSX_SSE +#include "../../general/precision_macros.h" +#include "real_vsx_6hv_template.c" +#undef VEC_SET +#undef BLOCK6 +#undef REALCASE +#undef SINGLE_PRECISION + diff -Nru elpa-2016.05.001/src/elpa2/kernels/simple_block4_template.F90 elpa-2019.11.001/src/elpa2/kernels/simple_block4_template.F90 --- elpa-2016.05.001/src/elpa2/kernels/simple_block4_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/simple_block4_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,310 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
-------------------------------------------------------------------------------------------------- +#endif + + subroutine quad_hh_trafo_& + &MATH_DATATYPE& + &_generic_simple_4hv_& + &PRECISION& + & (q, hh, nb, nq, ldq, ldh) + + use precision + use elpa_abstract_impl + implicit none + + !class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#if REALCASE==1 + +#ifdef USE_ASSUMED_SIZE + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+3) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6) +#endif + real(kind=C_DATATYPE_KIND) :: s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4 + real(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4 + real(kind=C_DATATYPE_KIND) :: h_2_1, h_3_2, h_3_1, h_4_3, h_4_2, h_4_1 + real(kind=C_DATATYPE_KIND) :: a_1_1(nq), a_2_1(nq), a_3_1(nq), a_4_1(nq) + real(kind=C_DATATYPE_KIND) :: h1, h2, h3, h4 + real(kind=C_DATATYPE_KIND) :: w(nq), z(nq), x(nq), y(nq) + real(kind=C_DATATYPE_KIND) :: tau1, tau2, tau3, tau4 +#endif /* REALCASE==1 */ + +#if COMPLEXCASE==1 + +#ifdef USE_ASSUMED_SIZE + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+3) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6) +#endif + complex(kind=C_DATATYPE_KIND) :: s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4 + complex(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4 + complex(kind=C_DATATYPE_KIND) :: h_2_1, h_3_2, h_3_1, h_4_3, h_4_2, h_4_1 + complex(kind=C_DATATYPE_KIND) :: a_1_1(nq), a_2_1(nq), a_3_1(nq), a_4_1(nq) + complex(kind=C_DATATYPE_KIND) :: w(nq), z(nq), x(nq), y(nq) + complex(kind=C_DATATYPE_KIND) :: h1, h2, h3, h4 + complex(kind=C_DATATYPE_KIND) :: tau1, tau2, tau3, tau4 +#endif /* COMPLEXCASE==1 */ + 
integer(kind=ik) :: i + ! Calculate dot product of the two Householder vectors + +#if REALCASE==1 + s_1_2 = hh(2,2) + s_1_3 = hh(3,3) + s_2_3 = hh(2,3) + s_1_4 = hh(4,4) + s_2_4 = hh(3,4) + s_3_4 = hh(2,4) + + s_1_2 = s_1_2 + hh(2,1) * hh(3,2) + s_2_3 = s_2_3 + hh(2,2) * hh(3,3) + s_3_4 = s_3_4 + hh(2,3) * hh(3,4) + + s_1_2 = s_1_2 + hh(3,1) * hh(4,2) + s_2_3 = s_2_3 + hh(3,2) * hh(4,3) + s_3_4 = s_3_4 + hh(3,3) * hh(4,4) + + s_1_3 = s_1_3 + hh(2,1) * hh(4,3) + s_2_4 = s_2_4 + hh(2,2) * hh(4,4) + + !DIR$ IVDEP + do i=5,nb + s_1_2 = s_1_2 + hh(i-1,1) * hh(i,2) + s_2_3 = s_2_3 + hh(i-1,2) * hh(i,3) + s_3_4 = s_3_4 + hh(i-1,3) * hh(i,4) + + s_1_3 = s_1_3 + hh(i-2,1) * hh(i,3) + s_2_4 = s_2_4 + hh(i-2,2) * hh(i,4) + + s_1_4 = s_1_4 + hh(i-3,1) * hh(i,4) + enddo +#endif + +#if COMPLEXCASE==1 + stop + !s = conjg(hh(2,2))*1.0 + !do i=3,nb + ! s = s+(conjg(hh(i,2))*hh(i-1,1)) + !enddo +#endif + + ! Do the Householder transformations + a_1_1(1:nq) = q(1:nq,4) + a_2_1(1:nq) = q(1:nq,3) + a_3_1(1:nq) = q(1:nq,2) + a_4_1(1:nq) = q(1:nq,1) + + h_2_1 = hh(2,2) + h_3_2 = hh(2,3) + h_3_1 = hh(3,3) + h_4_3 = hh(2,4) + h_4_2 = hh(3,4) + h_4_1 = hh(4,4) + +#if REALCASE == 1 + w(1:nq) = a_3_1(1:nq) * h_4_3 + a_4_1(1:nq) + w(1:nq) = a_2_1(1:nq) * h_4_2 + w(1:nq) + w(1:nq) = a_1_1(1:nq) * h_4_1 + w(1:nq) + + z(1:nq) = a_2_1(1:nq) * h_3_2 + a_3_1(1:nq) + z(1:nq) = a_1_1(1:nq) * h_3_1 + z(1:nq) + + y(1:nq) = a_1_1(1:nq) * h_2_1 + a_2_1(1:nq) + + x(1:nq) = a_1_1(1:nq) +#endif + +#if COMPLEXCASE==1 + stop + !y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2)) +#endif + + do i=5,nb +#if REALCASE == 1 + h1 = hh(i-3,1) + h2 = hh(i-2,2) + h3 = hh(i-1,3) + h4 = hh(i ,4) +#endif +#if COMPLEXCASE==1 + stop + ! h1 = conjg(hh(i-1,1)) + ! 
h2 = conjg(hh(i,2)) +#endif + + x(1:nq) = x(1:nq) + q(1:nq,i) * h1 + y(1:nq) = y(1:nq) + q(1:nq,i) * h2 + z(1:nq) = z(1:nq) + q(1:nq,i) * h3 + w(1:nq) = w(1:nq) + q(1:nq,i) * h4 + enddo + + h1 = hh(nb-2,1) + h2 = hh(nb-1,2) + h3 = hh(nb ,3) + +#if REALCASE==1 + x(1:nq) = x(1:nq) + q(1:nq,nb+1) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+1) * h2 + z(1:nq) = z(1:nq) + q(1:nq,nb+1) * h3 +#endif + +#if COMPLEXCASE==1 + stop + !x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1)) +#endif + + h1 = hh(nb-1,1) + h2 = hh(nb ,2) + + x(1:nq) = x(1:nq) + q(1:nq,nb+2) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+2) * h2 + + h1 = hh(nb,1) + + x(1:nq) = x(1:nq) + q(1:nq,nb+3) * h1 + + + ! Rank-1 update + tau1 = hh(1,1) + tau2 = hh(1,2) + tau3 = hh(1,3) + tau4 = hh(1,4) + + vs_1_2 = s_1_2 + vs_1_3 = s_1_3 + vs_2_3 = s_2_3 + vs_1_4 = s_1_4 + vs_2_4 = s_2_4 + vs_3_4 = s_3_4 + + h1 = tau1 + x(1:nq) = x(1:nq) * h1 + + h1 = tau2 + h2 = tau2 * vs_1_2 + y(1:nq) = y(1:nq) * h1 - x(1:nq) * h2 + + h1 = tau3 + h2 = tau3 * vs_1_3 + h3 = tau3 * vs_2_3 + z(1:nq) = z(1:nq) * h1 - (y(1:nq) * h3 + x(1:nq) * h2) + + h1 = tau4 + h2 = tau4 * vs_1_4 + h3 = tau4 * vs_2_4 + h4 = tau4 * vs_3_4 + + w(1:nq) = w(1:nq) * h1 - ( z(1:nq) * h4 + y(1:nq) * h3 + x(1:nq) * h2) + + q(1:nq,1) = q(1:nq,1) - w(1:nq) + + h4 = hh(2,4) + + q(1:nq,2) = q(1:nq,2) - (w(1:nq) * h4 + z(1:nq)) + + h3 = hh(2,3) + h4 = hh(3,4) + + q(1:nq,3) = q(1:nq,3) - y(1:nq) + q(1:nq,3) = -( z(1:nq) * h3) + q(1:nq,3) + q(1:nq,3) = -( w(1:nq) * h4) + q(1:nq,3) + + h2 = hh(2,2) + h3 = hh(3,3) + h4 = hh(4,4) + + q(1:nq,4) = q(1:nq,4) - x(1:nq) + q(1:nq,4) = -(y(1:nq) * h2) + q(1:nq,4) + q(1:nq,4) = -(z(1:nq) * h3) + q(1:nq,4) + q(1:nq,4) = -(w(1:nq) * h4) + q(1:nq,4) + + do i=5,nb + h1 = hh(i-3,1) + h2 = hh(i-2,2) + h3 = hh(i-1,3) + h4 = hh(i ,4) + + q(1:nq,i) = -(x(1:nq) * h1) + q(1:nq,i) + q(1:nq,i) = -(y(1:nq) * h2) + q(1:nq,i) + q(1:nq,i) = -(z(1:nq) * h3) + q(1:nq,i) + q(1:nq,i) = -(w(1:nq) * h4) + q(1:nq,i) + enddo + + h1 = hh(nb-2,1) + h2 = hh(nb-1,2) + h3 
= hh(nb ,3) + + q(1:nq,nb+1) = -(x(1:nq) * h1) + q(1:nq,nb+1) + q(1:nq,nb+1) = -(y(1:nq) * h2) + q(1:nq,nb+1) + q(1:nq,nb+1) = -(z(1:nq) * h3) + q(1:nq,nb+1) + + h1 = hh(nb-1,1) + h2 = hh(nb ,2) + + q(1:nq,nb+2) = - (x(1:nq) * h1) + q(1:nq,nb+2) + q(1:nq,nb+2) = - (y(1:nq) * h2) + q(1:nq,nb+2) + + h1 = hh(nb,1) + q(1:nq,nb+3) = - (x(1:nq) * h1) + q(1:nq,nb+3) + + end subroutine diff -Nru elpa-2016.05.001/src/elpa2/kernels/simple_block6_template.F90 elpa-2019.11.001/src/elpa2/kernels/simple_block6_template.F90 --- elpa-2016.05.001/src/elpa2/kernels/simple_block6_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/simple_block6_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,438 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! 
You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: A. Marek, MPCDF +! 
-------------------------------------------------------------------------------------------------- +#endif + + subroutine hexa_hh_trafo_& + &MATH_DATATYPE& + &_generic_simple_6hv_& + &PRECISION& + & (q, hh, nb, nq, ldq, ldh) + + use precision + use elpa_abstract_impl + implicit none + + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#if REALCASE==1 + +#ifdef USE_ASSUMED_SIZE + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+5) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6) +#endif + real(kind=C_DATATYPE_KIND) :: scalarproduct(15) + real(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4 + real(kind=C_DATATYPE_KIND) :: vs_1_5, vs_1_6, vs_2_5, vs_2_6, vs_3_5 + real(kind=C_DATATYPE_KIND) :: vs_3_6, vs_4_5, vs_4_6, vs_5_6 + real(kind=C_DATATYPE_KIND) :: a_1_1(nq), a_2_1(nq), a_3_1(nq), a_4_1(nq), a_5_1(nq), a_6_1(nq) + real(kind=C_DATATYPE_KIND) :: h_6_5, h_6_4, h_6_3, h_6_2, h_6_1 + real(kind=C_DATATYPE_KIND) :: h_5_4, h_5_3, h_5_2, h_5_1 + real(kind=C_DATATYPE_KIND) :: h_4_3, h_4_2, h_4_1 + real(kind=C_DATATYPE_KIND) :: h_2_1, h_3_2, h_3_1 + real(kind=C_DATATYPE_KIND) :: h1, h2, h3, h4, h5, h6 + real(kind=C_DATATYPE_KIND) :: w(nq), z(nq), x(nq), y(nq), t(nq), v(nq) + real(kind=C_DATATYPE_KIND) :: tau1, tau2, tau3, tau4, tau5, tau6 +#endif /* REALCASE==1 */ + + + integer(kind=ik) :: i, j + ! 
Calculate dot product of the two Householder vectors + + scalarproduct(1) = hh(2,2) + scalarproduct(2) = hh(3,3) + scalarproduct(3) = hh(2,3) + scalarproduct(4) = hh(4,4) + scalarproduct(5) = hh(3,4) + scalarproduct(6) = hh(2,4) + scalarproduct(7) = hh(5,5) + scalarproduct(8) = hh(4,5) + scalarproduct(9) = hh(3,5) + scalarproduct(10) = hh(2,5) + scalarproduct(11) = hh(6,6) + scalarproduct(12) = hh(5,6) + scalarproduct(13) = hh(4,6) + scalarproduct(14) = hh(3,6) + scalarproduct(15) = hh(2,6) + + scalarproduct(1) = scalarproduct(1) + hh(2,1) * hh(3,2) + scalarproduct(3) = scalarproduct(3) + hh(2,2) * hh(3,3) + scalarproduct(6) = scalarproduct(6) + hh(2,3) * hh(3,4) + scalarproduct(10) = scalarproduct(10) + hh(2,4) * hh(3,5) + scalarproduct(15) = scalarproduct(15) + hh(2,5) * hh(3,6) + + scalarproduct(1) = scalarproduct(1) + hh(3,1) * hh(4,2) + scalarproduct(3) = scalarproduct(3) + hh(3,2) * hh(4,3) + scalarproduct(6) = scalarproduct(6) + hh(3,3) * hh(4,4) + scalarproduct(10) = scalarproduct(10) + hh(3,4) * hh(4,5) + scalarproduct(15) = scalarproduct(15) + hh(3,5) * hh(4,6) + + scalarproduct(2) = scalarproduct(2) + hh(2,1) * hh(4,3) + scalarproduct(5) = scalarproduct(5) + hh(2,2) * hh(4,4) + scalarproduct(9) = scalarproduct(9) + hh(2,3) * hh(4,5) + scalarproduct(14) = scalarproduct(14) + hh(2,4) * hh(4,6) + + scalarproduct(1) = scalarproduct(1) + hh(4,1) * hh(5,2) + scalarproduct(3) = scalarproduct(3) + hh(4,2) * hh(5,3) + scalarproduct(6) = scalarproduct(6) + hh(4,3) * hh(5,4) + scalarproduct(10) = scalarproduct(10) + hh(4,4) * hh(5,5) + scalarproduct(15) = scalarproduct(15) + hh(4,5) * hh(5,6) + + scalarproduct(2) = scalarproduct(2) + hh(3,1) * hh(5,3) + scalarproduct(5) = scalarproduct(5) + hh(3,2) * hh(5,4) + scalarproduct(9) = scalarproduct(9) + hh(3,3) * hh(5,5) + scalarproduct(14) = scalarproduct(14) + hh(3,4) * hh(5,6) + + scalarproduct(4) = scalarproduct(4) + hh(2,1) * hh(5,4) + scalarproduct(8) = scalarproduct(8) + hh(2,2) * hh(5,5) + scalarproduct(13) = 
scalarproduct(13) + hh(2,3) * hh(5,6) + + scalarproduct(1) = scalarproduct(1) + hh(5,1) * hh(6,2) + scalarproduct(3) = scalarproduct(3) + hh(5,2) * hh(6,3) + scalarproduct(6) = scalarproduct(6) + hh(5,3) * hh(6,4) + scalarproduct(10) = scalarproduct(10) + hh(5,4) * hh(6,5) + scalarproduct(15) = scalarproduct(15) + hh(5,5) * hh(6,6) + + scalarproduct(2) = scalarproduct(2) + hh(4,1) * hh(6,3) + scalarproduct(5) = scalarproduct(5) + hh(4,2) * hh(6,4) + scalarproduct(9) = scalarproduct(9) + hh(4,3) * hh(6,5) + scalarproduct(14) = scalarproduct(14) + hh(4,4) * hh(6,6) + + scalarproduct(4) = scalarproduct(4) + hh(3,1) * hh(6,4) + scalarproduct(8) = scalarproduct(8) + hh(3,2) * hh(6,5) + scalarproduct(13) = scalarproduct(13) + hh(3,3) * hh(6,6) + + scalarproduct(7) = scalarproduct(7) + hh(2,1) * hh(6,5) + scalarproduct(12) = scalarproduct(12) + hh(2,2) * hh(6,6) + + !DIR$ IVDEP + do i=7,nb + scalarproduct(1) = scalarproduct(1) + hh(i-1,1) * hh(i,2) + scalarproduct(3) = scalarproduct(3) + hh(i-1,2) * hh(i,3) + scalarproduct(6) = scalarproduct(6) + hh(i-1,3) * hh(i,4) + scalarproduct(10) = scalarproduct(10) + hh(i-1,4) * hh(i,5) + scalarproduct(15) = scalarproduct(15) + hh(i-1,5) * hh(i,6) + + scalarproduct(2) = scalarproduct(2) + hh(i-2,1) * hh(i,3) + scalarproduct(5) = scalarproduct(5) + hh(i-2,2) * hh(i,4) + scalarproduct(9) = scalarproduct(9) + hh(i-2,3) * hh(i,5) + scalarproduct(14) = scalarproduct(14) + hh(i-2,4) * hh(i,6) + + scalarproduct(4) = scalarproduct(4) + hh(i-3,1) * hh(i,4) + scalarproduct(8) = scalarproduct(8) + hh(i-3,2) * hh(i,5) + scalarproduct(13) = scalarproduct(13) + hh(i-3,3) * hh(i,6) + + scalarproduct(7) = scalarproduct(7) + hh(i-4,1) * hh(i,5) + scalarproduct(12) = scalarproduct(12) + hh(i-4,2) * hh(i,6) + + scalarproduct(11) = scalarproduct(11) + hh(i-5,1) * hh(i,6) + enddo + +#if COMPLEXCASE==1 + stop + !s = conjg(hh(2,2))*1.0 + !do i=3,nb + ! s = s+(conjg(hh(i,2))*hh(i-1,1)) + !enddo +#endif + + ! 
Do the Householder transformations + a_1_1(1:nq) = q(1:nq,6) + a_2_1(1:nq) = q(1:nq,5) + a_3_1(1:nq) = q(1:nq,4) + a_4_1(1:nq) = q(1:nq,3) + a_5_1(1:nq) = q(1:nq,2) + a_6_1(1:nq) = q(1:nq,1) + + h_6_5 = hh(2,6) + h_6_4 = hh(3,6) + h_6_3 = hh(4,6) + h_6_2 = hh(5,6) + h_6_1 = hh(6,6) + + t(1:nq) = a_6_1(1:nq) + a_5_1(1:nq) * h_6_5 + a_4_1(1:nq) * h_6_4 + a_3_1(1:nq) * h_6_3 + a_2_1(1:nq) * h_6_2 + & + a_1_1(1:nq) * h_6_1 + + h_5_4 = hh(2,5) + h_5_3 = hh(3,5) + h_5_2 = hh(4,5) + h_5_1 = hh(5,5) + + v(1:nq) = a_5_1(1:nq) + a_4_1(1:nq) * h_5_4 + a_3_1(1:nq) * h_5_3 + a_2_1(1:nq) * h_5_2 + a_1_1(1:nq) * h_5_1 + + h_4_3 = hh(2,4) + h_4_2 = hh(3,4) + h_4_1 = hh(4,4) + + w(1:nq) = a_4_1(1:nq) + a_3_1(1:nq) * h_4_3 + a_2_1(1:nq) * h_4_2 + a_1_1(1:nq) * h_4_1 + + h_2_1 = hh(2,2) + h_3_2 = hh(2,3) + h_3_1 = hh(3,3) + + z(1:nq) = a_3_1(1:nq) + a_2_1(1:nq) * h_3_2 + a_1_1(1:nq) * h_3_1 + + y(1:nq) = a_2_1(1:nq) + a_1_1(1:nq) * h_2_1 + + x(1:nq) = a_1_1(1:nq) + + do i=7,nb + h1 = hh(i-5,1) ! + h2 = hh(i-4,2) ! + h3 = hh(i-3,3) ! + h4 = hh(i-2,4) ! + h5 = hh(i-1,5) ! + h6 = hh(i ,6) ! +#if COMPLEXCASE==1 + stop + ! h1 = conjg(hh(i-1,1)) + ! 
h2 = conjg(hh(i,2)) +#endif + + x(1:nq) = x(1:nq) + q(1:nq,i) * h1 + y(1:nq) = y(1:nq) + q(1:nq,i) * h2 + z(1:nq) = z(1:nq) + q(1:nq,i) * h3 + w(1:nq) = w(1:nq) + q(1:nq,i) * h4 + v(1:nq) = v(1:nq) + q(1:nq,i) * h5 + t(1:nq) = t(1:nq) + q(1:nq,i) * h6 + enddo + + h1 = hh(nb-4,1) + h2 = hh(nb-3,2) + h3 = hh(nb-2,3) + h4 = hh(nb-1,4) + h5 = hh(nb ,5) + + x(1:nq) = x(1:nq) + q(1:nq,nb+1) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+1) * h2 + z(1:nq) = z(1:nq) + q(1:nq,nb+1) * h3 + w(1:nq) = w(1:nq) + q(1:nq,nb+1) * h4 + v(1:nq) = v(1:nq) + q(1:nq,nb+1) * h5 + +#if COMPLEXCASE==1 + stop + !x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1)) +#endif + + h1 = hh(nb-3,1) + h2 = hh(nb-2,2) + h3 = hh(nb-1,3) + h4 = hh(nb ,4) + + x(1:nq) = x(1:nq) + q(1:nq,nb+2) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+2) * h2 + z(1:nq) = z(1:nq) + q(1:nq,nb+2) * h3 + w(1:nq) = w(1:nq) + q(1:nq,nb+2) * h4 + + h1 = hh(nb-2,1) + h2 = hh(nb-1,2) + h3 = hh(nb ,3) + + x(1:nq) = x(1:nq) + q(1:nq,nb+3) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+3) * h2 + z(1:nq) = z(1:nq) + q(1:nq,nb+3) * h3 + + h1 = hh(nb-1,1) + h2 = hh(nb ,2) + + x(1:nq) = x(1:nq) + q(1:nq,nb+4) * h1 + y(1:nq) = y(1:nq) + q(1:nq,nb+4) * h2 + + h1 = hh(nb,1) + + x(1:nq) = x(1:nq) + q(1:nq,nb+5) * h1 + + ! Rank-1 update + tau1 = hh(1,1) + x(1:nq) = x(1:nq) * tau1 + + tau2 = hh(1,2) + vs_1_2 = scalarproduct(1) + + h2 = tau2 * vs_1_2 ! 
+ y(1:nq) = y(1:nq) * tau2 - (x(1:nq) * h2) + + tau3 = hh(1,3) + vs_1_3 = scalarproduct(2) + vs_2_3 = scalarproduct(3) + + h2 = tau3 * vs_1_3 + h3 = tau3 * vs_2_3 + z(1:nq) = z(1:nq) * tau3 - (y(1:nq) * h3 + x(1:nq) * h2) + + tau4 = hh(1,4) + vs_1_4 = scalarproduct(4) + vs_2_4 = scalarproduct(5) + + h2 = tau4 * vs_1_4 + h3 = tau4 * vs_2_4 + + vs_3_4 = scalarproduct(6) + + h4 = tau4 * vs_3_4 + + w(1:nq) = w(1:nq) * tau4 - ( z(1:nq) * h4 + y(1:nq) * h3 + x(1:nq) * h2) + + tau5 = hh(1,5) + vs_1_5 = scalarproduct(7) + vs_2_5 = scalarproduct(8) + + h2 = tau5 * vs_1_5 + h3 = tau5 * vs_2_5 + + vs_3_5 = scalarproduct(9) + vs_4_5 = scalarproduct(10) + + h4 = tau5 * vs_3_5 + h5 = tau5 * vs_4_5 + + v(1:nq) = v(1:nq) * tau5 - ( w(1:nq) * h5 + z(1:nq) * h4 + y(1:nq) * h3 + x(1:nq) * h2) + + tau6 = hh(1,6) + vs_1_6 = scalarproduct(11) + vs_2_6 = scalarproduct(12) + + h2 = tau6 * vs_1_6 + h3 = tau6 * vs_2_6 + + vs_3_6 = scalarproduct(13) + vs_4_6 = scalarproduct(14) + vs_5_6 = scalarproduct(15) + + h4 = tau6 * vs_3_6 + h5 = tau6 * vs_4_6 + h6 = tau6 * vs_5_6 + + t(1:nq) = t(1:nq) * tau6 - ( v(1:nq) * h6 + w(1:nq) * h5 + z(1:nq) * h4 + y(1:nq) * h3 + x(1:nq) * h2) + + q(1:nq,1) = q(1:nq,1) - t(1:nq) + + h6 = hh(2,6) + + q(1:nq,2) = q(1:nq,2) - (v(1:nq) + t(1:nq) * h6) + + h5 = hh(2,5) + h6 = hh(3,6) + + q(1:nq,3) = q(1:nq,3) - (w(1:nq) + v(1:nq) * h5 + t(1:nq) * h6) + + h4 = hh(2,4) + h5 = hh(3,5) + h6 = hh(4,6) + + q(1:nq,4) = q(1:nq,4) - (z(1:nq) + w(1:nq) * h4 + v(1:nq) * h5 + t(1:nq) * h6) + + h3 = hh(2,3) + h4 = hh(3,4) + h5 = hh(4,5) + h6 = hh(5,6) + + q(1:nq,5) = q(1:nq,5) - (y(1:nq) + z(1:nq) * h3 + w(1:nq) * h4 + v(1:nq) * h5 + t(1:nq) * h6) + + h2 = hh(2,2) + h3 = hh(3,3) + h4 = hh(4,4) + h5 = hh(5,5) + h6 = hh(6,6) + + q(1:nq,6) = q(1:nq,6) - (x(1:nq) + y(1:nq) * h2 + z(1:nq) * h3 + w(1:nq) * h4 + v(1:nq) * h5 + t(1:nq) * h6) + + do i=7,nb + h1 = hh(i-5,1) + h2 = hh(i-4,2) + h3 = hh(i-3,3) + h4 = hh(i-2,4) + h5 = hh(i-1,5) + h6 = hh(i ,6) + + q(1:nq,i) = q(1:nq,i) 
-(x(1:nq) * h1 + y(1:nq) * h2 + z(1:nq) * h3 + w(1:nq) * h4 + v(1:nq) * h5 + t(1:nq) * h6) + enddo + + h1 = hh(nb-4,1) + h2 = hh(nb-3,2) + h3 = hh(nb-2,3) + h4 = hh(nb-1,4) + h5 = hh(nb ,5) + + q(1:nq,nb+1) = q(1:nq,nb+1) -(x(1:nq) * h1 + y(1:nq) * h2 + z(1:nq) * h3 + w(1:nq) * h4 + v(1:nq) * h5) + + h1 = hh(nb-3,1) + h2 = hh(nb-2,2) + h3 = hh(nb-1,3) + h4 = hh(nb ,4) + + q(1:nq,nb+2) = q(1:nq,nb+2) - (x(1:nq) * h1 + y(1:nq) * h2 + z(1:nq) * h3 + w(1:nq) * h4) + + h1 = hh(nb-2,1) + h2 = hh(nb-1,2) + h3 = hh(nb ,3) + + q(1:nq,nb+3) = q(1:nq,nb+3) - (x(1:nq) * h1 + y(1:nq) * h2 + z(1:nq) * h3) + h1 = hh(nb-1,1) + h2 = hh(nb ,2) + + q(1:nq,nb+4) = q(1:nq,nb+4) - (x(1:nq) * h1 +y(1:nq) * h2) + + h1 = hh(nb ,1) + + q(1:nq,nb+5) = q(1:nq,nb+5) - (x(1:nq) * h1) + + end subroutine diff -Nru elpa-2016.05.001/src/elpa2/kernels/simple_template.F90 elpa-2019.11.001/src/elpa2/kernels/simple_template.F90 --- elpa-2016.05.001/src/elpa2/kernels/simple_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/kernels/simple_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,246 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! 
it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! -------------------------------------------------------------------------------------------------- +! +! This file contains the compute intensive kernels for the Householder transformations. +! +! This is the small and simple version (no hand unrolling of loops etc.) but for some +! compilers this performs better than a sophisticated version with transformed and unrolled loops. +! +! It should be compiled with the highest possible optimization level. +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! -------------------------------------------------------------------------------------------------- +#endif + +#if COMPLEXCASE==1 + ! the intel compiler creates a temp copy of array q + ! 
this should be avoided without using assumed size arrays + + subroutine single_hh_trafo_& + &MATH_DATATYPE& + &_generic_simple_& + &PRECISION& + & (q, hh, nb, nq, ldq) + + use precision + use elpa_abstract_impl + implicit none + !class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nb, nq, ldq +#ifdef USE_ASSUMED_SIZE + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(*) +#else + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(1:nb) +#endif + integer(kind=ik) :: i + complex(kind=C_DATATYPE_KIND) :: tau1, x(nq) + + !call obj%timer%start("kernel_& + !&MATH_DATATYPE& + !&_generic_simple: single_hh_trafo_& + !&MATH_DATATYPE& + !&_generic_simple" // & + !&PRECISION_SUFFIX & + !) + + ! Just one Householder transformation + + x(1:nq) = q(1:nq,1) + + do i=2,nb + x(1:nq) = x(1:nq) + q(1:nq,i)*conjg(hh(i)) + enddo + + tau1 = hh(1) + x(1:nq) = x(1:nq)*(-tau1) + + q(1:nq,1) = q(1:nq,1) + x(1:nq) + + do i=2,nb + q(1:nq,i) = q(1:nq,i) + x(1:nq)*hh(i) + enddo + + + !call obj%timer%stop("kernel_& + !&MATH_DATATYPE& + !&_generic_simple: single_hh_trafo_& + !&MATH_DATATYPE& + !&_generic_simple" // & + !&PRECISION_SUFFIX & + !) + + end subroutine + +#endif /* COMPLEXCASE == 1 */ + ! 
-------------------------------------------------------------------------------------------------- + + subroutine double_hh_trafo_& + &MATH_DATATYPE& + &_generic_simple_& + &PRECISION& + & (q, hh, nb, nq, ldq, ldh) + + use precision + use elpa_abstract_impl + implicit none + + !class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nb, nq, ldq, ldh +#if REALCASE==1 + +#ifdef USE_ASSUMED_SIZE + real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+1) + real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6) +#endif + real(kind=C_DATATYPE_KIND) :: s, h1, h2, tau1, tau2, x(nq), y(nq) +#endif /* REALCASE==1 */ + +#if COMPLEXCASE==1 + +#ifdef USE_ASSUMED_SIZE + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*) +#else + complex(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+1) + complex(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:2) +#endif + complex(kind=C_DATATYPE_KIND) :: s, h1, h2, tau1, tau2, x(nq), y(nq) +#endif /* COMPLEXCASE==1 */ + integer(kind=ik) :: i + + !call obj%timer%start("kernel_& + !&MATH_DATATYPE& + !&_generic_simple: double_hh_trafo_& + !&MATH_DATATYPE& + !&_generic_simple" // & + !&PRECISION_SUFFIX & + !) + + ! Calculate dot product of the two Householder vectors +#if REALCASE==1 + s = hh(2,2)*1.0 + do i=3,nb + s = s+hh(i,2)*hh(i-1,1) + enddo +#endif + +#if COMPLEXCASE==1 + s = conjg(hh(2,2))*1.0 + do i=3,nb + s = s+(conjg(hh(i,2))*hh(i-1,1)) + enddo +#endif + + ! 
Do the Householder transformations + + x(1:nq) = q(1:nq,2) +#if REALCASE==1 + y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2) +#endif + +#if COMPLEXCASE==1 + y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2)) +#endif + + do i=3,nb +#if REALCASE==1 + h1 = hh(i-1,1) + h2 = hh(i,2) +#endif + +#if COMPLEXCASE==1 + h1 = conjg(hh(i-1,1)) + h2 = conjg(hh(i,2)) +#endif + x(1:nq) = x(1:nq) + q(1:nq,i)*h1 + y(1:nq) = y(1:nq) + q(1:nq,i)*h2 + enddo + +#if REALCASE==1 + x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1) +#endif + +#if COMPLEXCASE==1 + x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1)) +#endif + tau1 = hh(1,1) + tau2 = hh(1,2) + + h1 = -tau1 + x(1:nq) = x(1:nq)*h1 + h1 = -tau2 + h2 = -tau2*s + y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2 + + q(1:nq,1) = q(1:nq,1) + y(1:nq) + q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2) + + do i=3,nb + h1 = hh(i-1,1) + h2 = hh(i,2) + q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2 + enddo + + q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1) + + + !call obj%timer%stop("kernel_& + !&MATH_DATATYPE& + !&_generic_simple: double_hh_trafo_& + !&MATH_DATATYPE& + !&_generic_simple" // & + !&PRECISION_SUFFIX & + !) + + end subroutine diff -Nru elpa-2016.05.001/src/elpa2/mod_compute_hh_trafo.F90 elpa-2019.11.001/src/elpa2/mod_compute_hh_trafo.F90 --- elpa-2016.05.001/src/elpa2/mod_compute_hh_trafo.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/mod_compute_hh_trafo.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,134 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF + +module compute_hh_trafo +#include "config-f90.h" + use elpa_mpi + implicit none + +#ifdef WITH_OPENMP + public compute_hh_trafo_real_openmp_double +#else + public compute_hh_trafo_real_double +#endif + +#ifdef WITH_OPENMP + public compute_hh_trafo_complex_openmp_double +#else + public compute_hh_trafo_complex_double +#endif + + +#ifdef WANT_SINGLE_PRECISION_REAL +#ifdef WITH_OPENMP + public compute_hh_trafo_real_openmp_single +#else + public compute_hh_trafo_real_single +#endif +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#ifdef WITH_OPENMP + public compute_hh_trafo_complex_openmp_single +#else + public compute_hh_trafo_complex_single +#endif +#endif + contains + + !real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "compute_hh_trafo.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + + ! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "compute_hh_trafo.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + + !complex double precision +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "compute_hh_trafo.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + + ! complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "compute_hh_trafo.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif + +! +! !complex double precision +!#define COMPLEXCASE 1 +!#define DOUBLE_PRECISION 1 +!#include "../general/precision_macros.h" +!#include "compute_hh_trafo_complex_gpu.F90" +!#undef COMPLEXCASE +!#undef DOUBLE_PRECISION +! +! ! 
complex single precision +!#if defined(WANT_SINGLE_PRECISION_COMPLEX) +!#define COMPLEXCASE 1 +!#define SINGLE_PRECISION 1 +!#include "../general/precision_macros.h" +!#include "compute_hh_trafo_complex_gpu.F90" +!#undef COMPLEXCASE +!#undef SINGLE_PRECISION +!#endif +! +end module diff -Nru elpa-2016.05.001/src/elpa2/mod_pack_unpack_cpu.F90 elpa-2019.11.001/src/elpa2/mod_pack_unpack_cpu.F90 --- elpa-2016.05.001/src/elpa2/mod_pack_unpack_cpu.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/mod_pack_unpack_cpu.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,109 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! 
ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + +module pack_unpack_cpu +#include "config-f90.h" + implicit none + + private + +#ifdef WITH_OPENMP + public pack_row_real_cpu_openmp_double, unpack_row_real_cpu_openmp_double + public pack_row_complex_cpu_openmp_double, unpack_row_complex_cpu_openmp_double +#else + public pack_row_real_cpu_double, unpack_row_real_cpu_double + public pack_row_complex_cpu_double, unpack_row_complex_cpu_double +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#ifdef WITH_OPENMP + public pack_row_real_cpu_openmp_single, unpack_row_real_cpu_openmp_single + public pack_row_complex_cpu_openmp_single, unpack_row_complex_cpu_openmp_single +#else + public pack_row_real_cpu_single, unpack_row_real_cpu_single + public pack_row_complex_cpu_single, unpack_row_complex_cpu_single +#endif + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + + contains + + !real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_cpu.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + + ! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_cpu.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + + !complex double precision +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_cpu.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + + ! 
complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_cpu.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif + + +end module diff -Nru elpa-2016.05.001/src/elpa2/mod_pack_unpack_gpu.F90 elpa-2019.11.001/src/elpa2/mod_pack_unpack_gpu.F90 --- elpa-2016.05.001/src/elpa2/mod_pack_unpack_gpu.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/mod_pack_unpack_gpu.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,110 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! 
license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + + +module pack_unpack_gpu +#include "config-f90.h" + implicit none + + private + + public pack_row_group_real_gpu_double, unpack_row_group_real_gpu_double, & + unpack_and_prepare_row_group_real_gpu_double, compute_hh_dot_products_real_gpu_double, & + extract_hh_tau_real_gpu_double + + public pack_row_group_complex_gpu_double, unpack_row_group_complex_gpu_double, & + unpack_and_prepare_row_group_complex_gpu_double, compute_hh_dot_products_complex_gpu_double, & + extract_hh_tau_complex_gpu_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public pack_row_group_real_gpu_single, unpack_row_group_real_gpu_single, & + unpack_and_prepare_row_group_real_gpu_single, compute_hh_dot_products_real_gpu_single, & + extract_hh_tau_real_gpu_single + +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public pack_row_group_complex_gpu_single, unpack_row_group_complex_gpu_single, & + unpack_and_prepare_row_group_complex_gpu_single, compute_hh_dot_products_complex_gpu_single, & + extract_hh_tau_complex_gpu_single +#endif + contains + + !real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_gpu.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + + ! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_gpu.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + + !complex double precision +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_gpu.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + + ! 
complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "pack_unpack_gpu.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif + + + +end module diff -Nru elpa-2016.05.001/src/elpa2/mod_redist_band.F90 elpa-2019.11.001/src/elpa2/mod_redist_band.F90 --- elpa-2016.05.001/src/elpa2/mod_redist_band.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/mod_redist_band.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,114 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), fomerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! 
You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + + + +! ELPA2 -- 2-stage solver for ELPA +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". + +#include "config-f90.h" +module redist + + public + + contains + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "redist_band.F90" + +#undef REALCASE +#undef DOUBLE_PRECISION + +! single precision +#ifdef WANT_SINGLE_PRECISION_REAL + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "redist_band.F90" + +#undef REALCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_REAL */ + +! 
double precision +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "redist_band.F90" + +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 + +#include "../general/precision_macros.h" +#include "redist_band.F90" + +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + + + +end module redist + diff -Nru elpa-2016.05.001/src/elpa2/pack_unpack_cpu.F90 elpa-2019.11.001/src/elpa2/pack_unpack_cpu.F90 --- elpa-2016.05.001/src/elpa2/pack_unpack_cpu.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/pack_unpack_cpu.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,231 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! 
You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF +#endif + + subroutine pack_row_& + &MATH_DATATYPE& +#ifdef WITH_OPENMP + &_cpu_openmp_& +#else + &_cpu_& +#endif + &PRECISION & + (obj, a, row, n, stripe_width, & +#ifdef WITH_OPENMP + stripe_count, max_threads, thread_width, l_nev) +#else + last_stripe_width, stripe_count) +#endif + use elpa_abstract_impl + use precision + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik), intent(in) :: n, stripe_count, stripe_width +#ifdef WITH_OPENMP + integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev + logical :: useOPENMP + +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:,:) +#endif + +#else /* WITH_OPENMP */ + integer(kind=ik), intent(in) :: last_stripe_width +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND), intent(in) :: a(:,:,:) +#endif + +#endif /* WITH_OPENMP */ + +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: row(:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: row(:) +#endif + + integer(kind=ik) :: i, noff, nl +#ifdef WITH_OPENMP + integer(kind=ik) :: nt +#endif + + call obj%timer%start("pack_row_& + &MATH_DATATYPE& +#ifdef WITH_OPENMP + &_cpu_openmp" // & +#else + &_cpu" // & +#endif + &PRECISION_SUFFIX & + ) + +#ifdef WITH_OPENMP + do nt = 1, max_threads + 
do i = 1, stripe_count + noff = (nt-1)*thread_width + (i-1)*stripe_width + nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) + if (nl<=0) exit + row(noff+1:noff+nl) = a(1:nl,n,i,nt) + enddo + enddo +#else + do i=1,stripe_count + nl = merge(stripe_width, last_stripe_width, i +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF +#endif + + ! Pack a filled row group (i.e. an array of consecutive rows) + + subroutine pack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & + rows, n_offset, row_count) + use cuda_c_kernel + use cuda_functions + use precision + use iso_c_binding + implicit none + integer(kind=c_intptr_t) :: row_group_dev, a_dev + + integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev + integer(kind=ik), intent(in) :: n_offset, row_count +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: rows(:,:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: rows(:,:) +#endif + integer(kind=ik) :: max_idx + logical :: successCUDA + + ! Use many blocks for higher GPU occupancy + max_idx = (stripe_count - 1) * stripe_width + last_stripe_width + + ! Use one kernel call to pack the entire row group + +! call my_pack_kernel<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev) + + call launch_my_pack_gpu_kernel_& + &MATH_DATATYPE& + &_& + &PRECISION & + (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev) + + ! 
Issue one single transfer call for all rows (device to host) +! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count) + + successCUDA = cuda_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_& + &PRECISION& + &_& + &MATH_DATATYPE& + & , cudaMemcpyDeviceToHost) + if (.not.(successCUDA)) then + print *,"pack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &: error in cudaMemcpy" + stop 1 + endif + + end subroutine + + + ! Unpack a filled row group (i.e. an array of consecutive rows) + subroutine unpack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, & + a_dim2, l_nev, rows, n_offset, row_count) + use cuda_c_kernel + use precision + use iso_c_binding + use cuda_functions + implicit none + integer(kind=c_intptr_t) :: row_group_dev, a_dev + integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev + integer(kind=ik), intent(in) :: n_offset, row_count +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :) +#endif + + integer(kind=ik) :: max_idx + logical :: successCUDA + + ! Use many blocks for higher GPU occupancy + max_idx = (stripe_count - 1) * stripe_width + last_stripe_width + + ! Issue one single transfer call for all rows (host to device) +! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count) + + + successCUDA = cuda_memcpy( row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * & + size_of_& + &PRECISION& + &_& + &MATH_DATATYPE& + &, cudaMemcpyHostToDevice) + if (.not.(successCUDA)) then + print *,"unpack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + &: error in cudaMemcpy" + stop 1 + endif + + ! Use one kernel call to pack the entire row group + ! 
call my_unpack_kernel<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev) + + call launch_my_unpack_gpu_kernel_& + &MATH_DATATYPE& + &_& + &PRECISION & + ( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, & + row_group_dev,a_dev) + + end subroutine + + ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group + ! occurs when the queue is full or when the next row belongs to another group + subroutine unpack_and_prepare_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION & + (row_group, row_group_dev, a_dev, stripe_count, stripe_width, & + last_stripe_width, a_dim2, l_nev, row_group_size, nblk, & + unpack_idx, next_unpack_idx, force) + + use iso_c_binding + use precision + implicit none +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: row_group(:,:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: row_group(:,:) +#endif + integer(kind=c_intptr_t) :: row_group_dev, a_dev + integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev + integer(kind=ik), intent(inout) :: row_group_size + integer(kind=ik), intent(in) :: nblk + integer(kind=ik), intent(inout) :: unpack_idx + integer(kind=ik), intent(in) :: next_unpack_idx + logical, intent(in) :: force + + if (row_group_size == 0) then + ! Nothing to flush, just prepare for the upcoming row + row_group_size = 1 + else + if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then + ! A flush and a reset must be performed + call unpack_row_group_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, & + a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size) + row_group_size = 1 + else + ! Just prepare for the upcoming row + row_group_size = row_group_size + 1 + endif + endif + ! 
Always update the index for the upcoming row + unpack_idx = next_unpack_idx + end subroutine + + ! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below) + subroutine compute_hh_dot_products_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + & (bcast_buffer_dev, hh_dot_dev, nbw, n) + use cuda_c_kernel + use precision + use iso_c_binding + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev + integer(kind=ik), value :: nbw, n + + if (n .le. 1) return + call launch_compute_hh_dotp_gpu_kernel_& + &MATH_DATATYPE& + &_& + &PRECISION& + & ( bcast_buffer_dev, hh_dot_dev, nbw, n) + + + end subroutine + + ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below) + subroutine extract_hh_tau_& + &MATH_DATATYPE& + &_gpu_& + &PRECISION& + & (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero) + use cuda_c_kernel + use precision + use iso_c_binding + implicit none + integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev + integer(kind=ik), value :: nbw, n + logical, value :: is_zero + integer(kind=ik) :: val_is_zero + if (is_zero) then + val_is_zero = 1 + else + val_is_zero = 0 + endif + + call launch_extract_hh_tau_gpu_kernel_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero) + end subroutine + + ! ------------------------------------------- + ! Fortran back-transformation support kernels + ! ------------------------------------------- + + ! Reset a reduction block + ! Limitation: the thread-block size must be a divider of the reduction block's size + ! Reset 2 reduction blocks without an explicit synchronization at the end + ! Limitation: : the thread-block size must be a divider of the reduction block's size + ! Perform a reduction on an initialized, 128-element shared block + ! Compute the dot-product between 2 consecutive HH vectors + ! Limitation 1: the size of the thread block must be at most 128 and a power-of-2 + ! 
Limitation 2: the size of the warp must be equal to 32 + ! + ! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case) + ! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels + ! + ! ------------------------------------------- + ! Fortran back-transformation support kernels + ! ------------------------------------------- + ! + ! This is the simplest and slowest available backtransformation kernel + ! + ! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply + ! 2 Householder reflectors per iteration + ! + ! --------------------------------- + ! Row packing and unpacking kernels + ! --------------------------------- + ! + ! The row group packing kernel + + ! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note: + ! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance) + ! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel + ! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel + diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_pdgeqrf.F90 elpa-2019.11.001/src/elpa2/qr/elpa_pdgeqrf.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_pdgeqrf.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_pdgeqrf.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,91 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! 
- Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! + +#include "config-f90.h" + +module elpa_pdgeqrf + + use elpa_utilities + use elpa1_compute + use elpa_pdlarfb + use qr_utils_mod + use elpa_qrkernels + use elpa_mpi + implicit none + + PRIVATE + + public :: qr_pdgeqrf_2dcomm_double + public :: qr_pqrparam_init + public :: qr_pdlarfg2_1dcomm_check_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: qr_pdgeqrf_2dcomm_single + public :: qr_pdlarfg2_1dcomm_check_single +#endif + + + contains + ! 
real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#undef ALREADY_DEFINED +#include "../../general/precision_macros.h" +#include "elpa_pdgeqrf_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION +#define ALREADY_DEFINED + +#ifdef WANT_SINGLE_PRECISION_REAL + ! real single precision +#define REALCASE 1 +#define ALREADY_DEFINED +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "elpa_pdgeqrf_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + + +end module elpa_pdgeqrf diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_pdgeqrf_template.F90 elpa-2019.11.001/src/elpa2/qr/elpa_pdgeqrf_template.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_pdgeqrf_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_pdgeqrf_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,3037 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! + subroutine qr_pdgeqrf_2dcomm_& + &PRECISION & + (obj, a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, & + work, workLength, lwork, m, n, mb, nb, rowidx, colidx, & + rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1, rank_ = 2, eps_ = 3 + + ! input variables (local) + integer(kind=ik), intent(in) :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, & + colsT, workLength + + ! input variables (global) + integer(kind=ik) :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) + real(kind=C_DATATYPE_KIND) :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*) +#else + integer(kind=ik) :: PQRPARAM(1:11) + real(kind=C_DATATYPE_KIND) :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), & + t(1:ldt,1:colsT), work(1:workLength) +#endif + ! output variables (global) + real(kind=C_DATATYPE_KIND) :: blockheuristic(*) + + ! input variables derived from PQRPARAM + integer(kind=ik) :: updatemode,tmerge,size2d + + ! 
local scalars + integer(kind=ik) :: mpirank_cols,broadcast_size,mpirank_rows + integer(kind=MPI_KIND) :: mpirank_colsMPI, mpirank_rowsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: mpirank_cols_qr,mpiprocs_cols + integer(kind=MPI_KIND) :: mpiprocs_colsMPI + integer(kind=ik) :: lcols_temp,lcols,icol,lastcol + integer(kind=ik) :: baseoffset,offset,idx,voffset + integer(kind=ik) :: update_voffset,update_tauoffset + integer(kind=ik) :: update_lcols + integer(kind=ik) :: work_offset + + real(kind=C_DATATYPE_KIND) :: dbroadcast_size(1),dtmat_bcast_size(1) + real(kind=C_DATATYPE_KIND) :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1) + integer(kind=ik) :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size + integer(kind=ik) :: remaining_cols + integer(kind=ik) :: total_cols + integer(kind=ik) :: incremental_update_size ! needed for incremental update mode + + call obj%timer%start("qr_pdgeqrf_2dcomm_& + &PRECISION& + &") + size2d = PQRPARAM(1) + updatemode = PQRPARAM(2) + tmerge = PQRPARAM(3) + + ! 
copy value before we are going to filter it + total_cols = n + call mpi_comm_rank(int(mpicomm_cols,kind=MPI_KIND) ,mpirank_colsMPI, mpierr) + call mpi_comm_rank(int(mpicomm_rows,kind=MPI_KIND) ,mpirank_rowsMPI, mpierr) + call mpi_comm_size(int(mpicomm_cols,kind=MPI_KIND) ,mpiprocs_colsMPI, mpierr) + + mpirank_cols = int(mpirank_colsMPI,kind=c_int) + mpirank_rows = int(mpirank_rowsMPI,kind=c_int) + mpiprocs_cols = int(mpiprocs_colsMPI,kind=c_int) + +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqrf_1dcomm_& + &PRECISION & + (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & + PQRPARAM(4),mpicomm_rows,blockheuristic) +#else + call qr_pdgeqrf_1dcomm_& + &PRECISION & + (obj,a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & + PQRPARAM(4:11),mpicomm_rows,blockheuristic) +#endif + call qr_pdgeqrf_pack_unpack_& + &PRECISION & + (obj,v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows) + call qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0) + +#ifdef DOUBLE_PRECISION_REAL + pdlarft_size(1) = 0.0_rk8 +#else + pdlarft_size(1) = 0.0_rk4 +#endif + + call qr_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, & + pdlarfb_size(1),-1) + call qr_tmerge_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, & + mpicomm_rows,tmerge_pdlarfb_size(1),-1) + + + temptau_offset = 1 + temptau_size = total_cols + broadcast_offset = temptau_offset + temptau_size + broadcast_size = int(dbroadcast_size(1) + dtmat_bcast_size(1)) + work_offset = broadcast_offset + broadcast_size + + if (lwork .eq. 
-1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = (real(temptau_size,kind=C_DATATYPE_KIND) + real(broadcast_size,kind=C_DATATYPE_KIND) + max(pdgeqrf_size(1), & + pdlarft_size(1),pdlarfb_size(1), & + tmerge_pdlarfb_size(1))) +#else + work(1) = (real(temptau_size,kind=rk4) + real(broadcast_size,kind=rk4) + max(pdgeqrf_size(1), & + pdlarft_size(1),pdlarfb_size(1), & + tmerge_pdlarfb_size(1))) +#endif + call obj%timer%stop("qr_pdgeqrf_2dcomm_& + &PRECISION& + &") + return + end if + + lastcol = colidx-total_cols+1 + voffset = total_cols + + incremental_update_size = 0 + + ! clear v buffer: just ensure that there is no junk in the upper triangle + ! part, otherwise pdlarfb gets some problems + ! pdlarfl(2) do not have these problems as they are working more on a Vector + ! basis +#ifdef DOUBLE_PRECISION_REAL + v(1:ldv,1:total_cols) = 0.0_rk8 +#else + v(1:ldv,1:total_cols) = 0.0_rk4 +#endif + icol = colidx + + remaining_cols = total_cols + + !print *,'start decomposition',m,rowidx,colidx + + do while (remaining_cols .gt. 0) + + ! determine rank of process column with next qr block + mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols) + + ! lcols can't be larger than than nb + ! exception: there is only one process column + + ! however, we might not start at the first local column. + ! therefore assume a matrix of size (1xlcols) starting at (1,icol) + ! determine the real amount of local columns + lcols_temp = min(nb,(icol-lastcol+1)) + + ! blocking parameter + lcols_temp = max(min(lcols_temp,size2d),1) + + ! determine size from last decomposition column + ! to first decomposition column + call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, & + mpirank_cols_qr,mpiprocs_cols, & + lcols,baseoffset,offset) + + voffset = remaining_cols - lcols + 1 + + idx = rowidx - colidx + icol + + if (mpirank_cols .eq. mpirank_cols_qr) then + ! 
qr decomposition part +#ifdef DOUBLE_PRECISION_REAL + tau(offset:offset+lcols-1) = 0.0_rk8 +#else + tau(offset:offset+lcols-1) = 0.0_rk4 +#endif + +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqrf_1dcomm_& + &PRECISION & + (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & + work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), & + mpicomm_rows,blockheuristic) + +#else + call qr_pdgeqrf_1dcomm_& + &PRECISION & + (obj,a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & + work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), & + mpicomm_rows,blockheuristic) +#endif + + ! pack broadcast buffer (v + tau) + call qr_pdgeqrf_pack_unpack_& + &PRECISION & + (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,& + idx,rev,0,mpicomm_rows) + + ! determine broadcast size + call qr_pdgeqrf_pack_unpack_& + &PRECISION & + (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,& + 0,mpicomm_rows) + broadcast_size = int(dbroadcast_size(1)) + + !if (mpirank_rows .eq. 0) then + ! pack tmatrix into broadcast buffer and calculate new size + call qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,tau(offset),t(voffset,voffset),ldt, & + work(broadcast_offset+broadcast_size),lwork,lcols,0) + call qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0) + broadcast_size = broadcast_size + int(dtmat_bcast_size(1)) + !end if + + ! initiate broadcast (send part) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(work(broadcast_offset), int(broadcast_size,kind=MPI_KIND), mpi_real8, & + int(mpirank_cols_qr,kind=MPI_KIND), int(mpicomm_cols,kind=MPI_KIND), mpierr) +#else + call MPI_Bcast(work(broadcast_offset), int(broadcast_size,kind=MPI_KIND), mpi_real4, & + int(mpirank_cols_qr,kind=MPI_KIND), int(mpicomm_cols,kind=MPI_KIND), mpierr) +#endif + +#endif + ! 
copy tau parts into temporary tau buffer + work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1) + + !print *,'generated tau:', tau(offset) + else + ! Vector exchange part + + ! determine broadcast size + call qr_pdgeqrf_pack_unpack_& + &PRECISION & + (obj,v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) + broadcast_size = int(dbroadcast_size(1)) + + call qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & + dtmat_bcast_size(1),-1,lcols,0) + tmat_bcast_size = dtmat_bcast_size(1) + + !print *,'broadcast_size (nonqr)',broadcast_size + broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) + + ! initiate broadcast (recv part) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(work(broadcast_offset), int(broadcast_size,kind=MPI_KIND), mpi_real8, & + int(mpirank_cols_qr,kind=MPI_KIND), int(mpicomm_cols,kind=MPI_KIND), mpierr) +#else + call MPI_Bcast(work(broadcast_offset), int(broadcast_size,kind=MPI_KIND), mpi_real4, & + int(mpirank_cols_qr,kind=MPI_KIND), int(mpicomm_cols,kind=MPI_KIND), mpierr) +#endif + +#endif + ! last n*n elements in buffer are (still empty) T matrix elements + ! fetch from first process in each column + + ! unpack broadcast buffer (v + tau) + call qr_pdgeqrf_pack_unpack_& + &PRECISION & + (obj,v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols, & + mb,rowidx,idx,rev,1,mpicomm_rows) + + ! now send t matrix to other processes in our process column + broadcast_size = int(dbroadcast_size(1)) + tmat_bcast_size = int(dtmat_bcast_size(1)) + + ! t matrix should now be available on all processes => unpack + call qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & + work(broadcast_offset+broadcast_size),lwork,lcols,1) + end if + + remaining_cols = remaining_cols - lcols + + ! 
apply householder vectors to whole trailing matrix parts (if any) + + update_voffset = voffset + update_tauoffset = icol + update_lcols = lcols + incremental_update_size = incremental_update_size + lcols + + icol = icol - lcols + ! count colums from first column of global block to current index + call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, & + mpirank_cols,mpiprocs_cols, & + lcols,baseoffset,offset) + + if (lcols .gt. 0) then + + !print *,'updating trailing matrix' + + if (updatemode .eq. ichar('I')) then + print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1' + else if (updatemode .eq. ichar('F')) then + ! full update no merging + call qr_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, & + work(temptau_offset+update_voffset-1), & + t(update_voffset,update_voffset),ldt, & + rowidx,idx,1,mpicomm_rows,work(work_offset),lwork) + else + ! full update + merging default + call qr_tmerge_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, & + v(1,update_voffset),ldv, & + t(update_voffset,update_voffset),ldt, & + a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, & + work(work_offset),lwork) + end if + else + if (updatemode .eq. ichar('I')) then + !print *,'sole merging of (incremental) T matrix', mpirank_cols, & + ! n-(update_voffset+incremental_update_size-1) + call qr_tmerge_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,0,n-(update_voffset+incremental_update_size-1), & + incremental_update_size,v(1,update_voffset),ldv, & + t(update_voffset,update_voffset),ldt, & + a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) + + ! reset for upcoming incremental updates + incremental_update_size = 0 + else if (updatemode .eq. ichar('M')) then + ! 
final merge + call qr_tmerge_pdlarfb_1dcomm_& + &PRECISION & + (m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, & + v(1,update_voffset),ldv, & + t(update_voffset,update_voffset),ldt, & + a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) + else + ! full updatemode - nothing to update + end if + + ! reset for upcoming incremental updates + incremental_update_size = 0 + end if + end do + + if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then + ! finally merge all small T parts + call qr_pdlarft_tree_merge_1dcomm_& +&PRECISION & +(m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork) + end if + + !print *,'stop decomposition',rowidx,colidx + call obj%timer%stop("qr_pdgeqrf_2dcomm_& + &PRECISION& + &") + end subroutine + + subroutine qr_pdgeqrf_1dcomm_& +&PRECISION & +(obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans, & + PQRPARAM,mpicomm,blockheuristic) + use precision + use elpa1_impl + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3 + + ! input variables (local) + integer(kind=ik) :: lda,lwork,ldv,ldt + real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) + + ! input variables (global) + integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) + +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + ! derived input variables + + ! derived further input variables from QR_PQRPARAM + integer(kind=ik) :: size1d,updatemode,tmerge + + ! output variables (global) + real(kind=C_DATATYPE_KIND) :: blockheuristic(*) + + ! 
local scalars + integer(kind=ik) :: nr_blocks,remainder,current_block,aoffset,idx,updatesize + real(kind=C_DATATYPE_KIND) :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1) + call obj%timer%start("qr_pdgeqrf_1dcomm_& + &PRECISION& + &") + size1d = max(min(PQRPARAM(1),n),1) + updatemode = PQRPARAM(2) + tmerge = PQRPARAM(3) + + if (lwork .eq. -1) then +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & + m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic) +#else + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & + m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic) +#endif + ! reserve more space for incremental mode + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,n,n,v,ldv,t,ldt, & + a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1) + + call qr_pdlarft_tree_merge_1dcomm_& +&PRECISION & +(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1) + + work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1)) + call obj%timer%stop("qr_pdgeqrf_1dcomm_& + &PRECISION& + &") + return + end if + + nr_blocks = n / size1d + remainder = n - nr_blocks*size1d + + current_block = 0 + do while (current_block .lt. nr_blocks) + idx = rowidx-current_block*size1d + updatesize = n-(current_block+1)*size1d + aoffset = 1+updatesize +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & + m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) + +#else + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & + m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) +#endif + if (updatemode .eq. ichar('M')) then + ! 
full update + merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,updatesize,current_block*size1d,size1d, & + v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & + a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork) + else if (updatemode .eq. ichar('I')) then + if (updatesize .ge. size1d) then + ! incremental update + merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,size1d,current_block*size1d,size1d, & + v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & + a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork) + + else ! only remainder left + ! incremental update + merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,remainder,current_block*size1d,size1d, & + v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & + a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork) + end if + else ! full update no merging is default + ! full update no merging + call qr_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, & + tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork) + end if + + ! move on to next block + current_block = current_block+1 + end do + + if (remainder .gt. 0) then + aoffset = 1 + idx = rowidx-size1d*nr_blocks +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & + m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) + +#else + call qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & + m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) +#endif + if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then + ! final merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,0,size1d*nr_blocks,remainder, & + v,ldv,t,ldt, & + a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter + end if + end if + + if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then + ! 
finally merge all small T parts + call qr_pdlarft_tree_merge_1dcomm_& +&PRECISION & +(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) + end if + call obj%timer%stop("qr_pdgeqrf_1dcomm_& + &PRECISION& + &") + + end subroutine + + ! local a and tau are assumed to be positioned at the right column from a local + ! perspective + ! TODO: if local amount of data turns to zero the algorithm might produce wrong + ! results (probably due to old buffer contents) + subroutine qr_pdgeqr2_1dcomm_& +&PRECISION & +(obj,a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev, & + trans,PQRPARAM,mpicomm,blockheuristic) + use precision + !use elpa1_impl ! check this + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4 + + ! input variables (local) + integer(kind=ik) :: lda,lwork,ldv,ldt + real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) + + ! input variables (global) + integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + ! output variables (global) + real(kind=C_DATATYPE_KIND) :: blockheuristic(*) + + ! derived further input variables from QR_PQRPARAM + integer(kind=ik) :: maxrank,hgmode,updatemode + + ! 
local scalars + integer(kind=ik) :: icol,incx,idx + real(kind=C_DATATYPE_KIND) :: pdlarfg_size(1),pdlarf_size(1),total_size + real(kind=C_DATATYPE_KIND) :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1) + real(kind=C_DATATYPE_KIND) :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1) + integer(kind=ik) :: mpirank,mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: rank,lastcol,actualrank,nextrank + integer(kind=ik) :: update_cols,decomposition_cols + integer(kind=ik) :: current_column + call obj%timer%start("qr_pdgeqr2_1dcomm_& + &PRECISION& + &") + + maxrank = min(PQRPARAM(1),n) + updatemode = PQRPARAM(2) + hgmode = PQRPARAM(4) + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + if (trans .eq. 1) then + incx = lda + else + incx = 1 + end if + + if (lwork .eq. 
-1) then + + call qr_pdlarfg_1dcomm_& +&PRECISION & +(obj,a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm) + call qr_pdlarfl_1dcomm_& +&PRECISION & +(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm) +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdlarfg2_1dcomm_ref_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, & + rev,mpicomm,actualrank) + + call qr_pdlarfgk_1dcomm_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank) + +#else + call qr_pdlarfg2_1dcomm_ref_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), & + rev,mpicomm,actualrank) + + call qr_pdlarfgk_1dcomm_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n, & + rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank) +#endif + call qr_pdlarfl2_tmatrix_1dcomm_& +&PRECISION & +(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm) +#ifdef DOUBLE_PRECISION_REAL + pdlarft_size(1) = 0.0_rk8 +#else + pdlarft_size(1) = 0.0_rk4 +#endif + call qr_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1) +#ifdef DOUBLE_PRECISION_REAL + pdlarft_pdlarfb_size(1) = 0.0_rk8 +#else + pdlarft_pdlarfb_size(1) = 0.0_rk4 +#endif + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,& + updatemode,mpicomm,tmerge_pdlarfb_size(1),-1) + + total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), & + pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)) + + work(1) = total_size + call obj%timer%stop("qr_pdgeqr2_1dcomm_& + &PRECISION& + &") + return + end if + + icol = 1 + lastcol = min(rowidx,n) + decomposition_cols = lastcol + update_cols = n + do while (decomposition_cols .gt. 0) ! local qr block + icol = lastcol-decomposition_cols+1 + idx = rowidx-icol+1 + + ! 
get possible rank size + ! limited by number of columns and remaining rows + rank = min(n-icol+1,maxrank,idx) + + current_column = n-icol+1-rank+1 + + if (rank .eq. 1) then + + call qr_pdlarfg_1dcomm_& +&PRECISION & +(obj,a(1,current_column),incx, & + tau(current_column),work,lwork, & + m,idx,mb,hgmode,1,mpicomm) +#ifdef DOUBLE_PRECISION_REAL + v(1:ldv,current_column) = 0.0_rk8 +#else + v(1:ldv,current_column) = 0.0_rk4 +#endif + call qr_pdlarfg_copy_1dcomm_& +&PRECISION & +(obj,a(1,current_column),incx, & + v(1,current_column),1, & + m,baseidx,idx,mb,1,mpicomm) + + ! initialize t matrix part + t(current_column,current_column) = tau(current_column) + + actualrank = 1 + + else if (rank .eq. 2) then +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdlarfg2_1dcomm_ref_& +&PRECISION & +(obj,a(1,current_column),lda,tau(current_column), & + t(current_column,current_column),ldt,v(1,current_column),ldv, & + baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank) + +#else + call qr_pdlarfg2_1dcomm_ref_& +&PRECISION & +(obj,a(1,current_column),lda,tau(current_column), & + t(current_column,current_column),ldt,v(1,current_column),ldv, & + baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) +#endif + else +#ifdef USE_ASSUMED_SIZE_QR + call qr_pdlarfgk_1dcomm_& +&PRECISION & +(obj,a(1,current_column),lda,tau(current_column), & + t(current_column,current_column),ldt,v(1,current_column),ldv, & + baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank) + +#else + call qr_pdlarfgk_1dcomm_& +&PRECISION & +(obj,a(1,current_column),lda,tau(current_column), & + t(current_column,current_column),ldt,v(1,current_column),ldv, & + baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) +#endif + end if + + blockheuristic(actualrank) = blockheuristic(actualrank) + 1 + + ! the blocked decomposition versions already updated their non + ! 
decomposed parts using their information after communication + update_cols = decomposition_cols - rank + decomposition_cols = decomposition_cols - actualrank + + ! needed for incremental update + nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1) + + if (current_column .gt. 1) then + idx = rowidx-icol+1 + + if (updatemode .eq. ichar('I')) then + ! incremental update + merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, & + v(1,current_column+(rank-actualrank)),ldv, & + t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & + a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,& + mpicomm,work,lwork) + else + ! full update + merging + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,update_cols,n-(current_column+rank-1),actualrank, & + v(1,current_column+(rank-actualrank)),ldv, & + t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & + a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork) + end if + else + call qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,0,n-(current_column+rank-1),actualrank, & + v(1,current_column+(rank-actualrank)), & + ldv, & + t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & + a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) + end if + + end do + call obj%timer%stop("qr_pdgeqr2_1dcomm_& + &PRECISION& + &") + end subroutine + + ! incx == 1: column major + ! incx != 1: row major + subroutine qr_pdlarfg_1dcomm_& +&PRECISION & +(obj,x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,communicator) + + use precision + !use elpa1_impl !check this + use qr_utils_mod + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2, eps_ = 3 + + ! 
input variables (local) + integer(kind=ik) :: incx,lwork,hgmode + real(kind=C_DATATYPE_KIND) :: x(*),work(*) + + ! input variables (global) + integer(kind=ik) :: communicator,nb,idx,n,rev + + ! output variables (global) + real(kind=C_DATATYPE_KIND) :: tau + + ! local scalars + integer(kind=ik) :: mpirank,mpiprocs,mpirank_top + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: sendsize,recvsize + integer(kind=ik) :: local_size,local_offset,baseoffset + integer(kind=ik) :: topidx,top,iproc + real(kind=C_DATATYPE_KIND) :: alpha,xnorm,dot,xf + + ! external functions +#ifdef DOUBLE_PRECISION_REAL + real(kind=C_DATATYPE_KIND), external :: ddot,dlapy2,dnrm2 +#else + real(kind=C_DATATYPE_KIND), external :: sdot,slapy2,snrm2 +#endif + external :: dscal + + ! intrinsic +! intrinsic sign + call obj%timer%start("qr_pdlarfg_1dcomm_& + &PRECISION& + &") + if (idx .le. 1) then + tau = 0.0d0 + call obj%timer%stop("qr_pdlarfg_1dcomm_& + &PRECISION& + &") + return + end if + call MPI_Comm_rank(int(communicator,kind=MPI_KIND) , mpirankMPI, mpierr) + call MPI_Comm_size(int(communicator,kind=MPI_KIND) , mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI, kind=c_int) + mpiprocs = int(mpiprocsMPI, kind=c_int) + + ! calculate expected work size and store in work(1) + if (hgmode .eq. ichar('s')) then + ! allreduce (MPI_SUM) + sendsize = 2 + recvsize = sendsize + else if (hgmode .eq. ichar('x')) then + ! alltoall + sendsize = mpiprocs*2 + recvsize = sendsize + else if (hgmode .eq. ichar('g')) then + ! allgather + sendsize = 2 + recvsize = mpiprocs*sendsize + else + ! no exchange at all (benchmarking) + sendsize = 2 + recvsize = sendsize + end if + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = real(sendsize + recvsize,kind=C_DATATYPE_KIND) +#else + work(1) = real(sendsize + recvsize,kind=rk4) +#endif + + call obj%timer%stop("qr_pdlarfg_1dcomm_& + &PRECISION& + &") + return + end if + + ! 
Processor id for global index of top element + mpirank_top = MOD((idx-1)/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then + topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) + top = 1+(topidx-1)*incx + end if + + call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & + local_size,baseoffset,local_offset) + + local_offset = local_offset * incx + + ! calculate and exchange information + if (hgmode .eq. ichar('s')) then + if (mpirank .eq. mpirank_top) then + alpha = x(top) + else +#ifdef DOUBLE_PRECISION_REAL + alpha = 0.0_rk8 +#else + alpha = 0.0_rk4 +#endif + end if +#ifdef DOUBLE_PRECISION_REAL + dot = ddot(local_size, & + x(local_offset), incx, & + x(local_offset), incx) +#else + dot = sdot(local_size, & + x(local_offset), incx, & + x(local_offset), incx) +#endif + work(1) = alpha + work(2) = dot +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(1),work(sendsize+1), & + int(sendsize,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(communicator,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work(1),work(sendsize+1), & + int(sendsize,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(communicator,kind=MPI_KIND), mpierr) +#endif + +#else + work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) +#endif + alpha = work(sendsize+1) + xnorm = sqrt(work(sendsize+2)) + else if (hgmode .eq. ichar('x')) then + if (mpirank .eq. 
mpirank_top) then + alpha = x(top) + else +#ifdef DOUBLE_PRECISION_REAL + alpha = 0.0_rk8 +#else + alpha = 0.0_rk4 +#endif + end if +#ifdef DOUBLE_PRECISION_REAL + xnorm = dnrm2(local_size, x(local_offset), incx) +#else + xnorm = snrm2(local_size, x(local_offset), incx) +#endif + do iproc=0,mpiprocs-1 + work(2*iproc+1) = alpha + work(2*iproc+2) = xnorm + end do +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_alltoall(work(1), 2_MPI_KIND, mpi_real8, & + work(sendsize+1), 2_MPI_KIND, mpi_real8, & + int(communicator,kind=MPI_KIND), mpierr) +#else + call mpi_alltoall(work(1), 2_MPI_KIND, mpi_real4, & + work(sendsize+1), 2_MPI_KIND, mpi_real4, & + int(communicator,kind=MPI_KIND), mpierr) +#endif + +#else + work(sendsize+1:sendsize+1+2-1) = work(1:2) +#endif + ! extract alpha value + alpha = work(sendsize+1+mpirank_top*2) + + ! copy norm parts of buffer to beginning + do iproc=0,mpiprocs-1 + work(iproc+1) = work(sendsize+1+2*iproc+1) + end do + +#ifdef DOUBLE_PRECISION_REAL + xnorm = dnrm2(mpiprocs, work(1), 1) +#else + xnorm = snrm2(mpiprocs, work(1), 1) +#endif + else if (hgmode .eq. ichar('g')) then + if (mpirank .eq. mpirank_top) then + alpha = x(top) + else +#ifdef DOUBLE_PRECISION_REAL + alpha = 0.0_rk8 +#else + alpha = 0.0_rk4 +#endif + end if +#ifdef DOUBLE_PRECISION_REAL + xnorm = dnrm2(local_size, x(local_offset), incx) +#else + xnorm = snrm2(local_size, x(local_offset), incx) +#endif + work(1) = alpha + work(2) = xnorm + + ! allgather +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allgather(work(1), int(sendsize,kind=MPI_KIND), mpi_real8, & + work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real8, & + int(communicator,kind=MPI_KIND), mpierr) +#else + call mpi_allgather(work(1), int(sendsize,kind=MPI_KIND), mpi_real4, & + work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real4, & + int(communicator,kind=MPI_KIND), mpierr) +#endif + +#else + work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) +#endif + ! 
extract alpha value + alpha = work(sendsize+1+mpirank_top*2) + + ! copy norm parts of buffer to beginning + do iproc=0,mpiprocs-1 + work(iproc+1) = work(sendsize+1+2*iproc+1) + end do +#ifdef DOUBLE_PRECISION_REAL + xnorm = dnrm2(mpiprocs, work(1), 1) +#else + xnorm = snrm2(mpiprocs, work(1), 1) +#endif + else + ! dnrm2 +#ifdef DOUBLE_PRECISION_REAL + xnorm = dnrm2(local_size, x(local_offset), incx) +#else + xnorm = snrm2(local_size, x(local_offset), incx) +#endif + if (mpirank .eq. mpirank_top) then + alpha = x(top) + else +#ifdef DOUBLE_PRECISION_REAL + alpha = 0.0_rk8 +#else + alpha = 0.0_rk4 +#endif + end if + + ! no exchange at all (benchmarking) +#ifdef DOUBLE_PRECISION_REAL + xnorm = 0.0_rk8 +#else + xnorm = 0.0_rk4 +#endif + end if + + !print *,'ref hg:', idx,xnorm,alpha + !print *,x(1:n) + + ! calculate householder information +#ifdef DOUBLE_PRECISION_REAL + if (xnorm .eq. 0.0_rk8) then + ! H = I + + tau = 0.0_rk8 +#else + if (xnorm .eq. 0.0_rk4) then + ! H = I + + tau = 0.0_rk4 +#endif + else + ! General case + call hh_transform_real_& +&PRECISION & +(obj,alpha,xnorm**2,xf,tau, .false.) + if (mpirank .eq. mpirank_top) then + x(top) = alpha + end if +#ifdef DOUBLE_PRECISION_REAL + call dscal(local_size, xf, & + x(local_offset), incx) +#else + call sscal(local_size, xf, & + x(local_offset), incx) +#endif + + ! TODO: reimplement norm rescale method of + ! original PDLARFG using mpi? + + end if + + ! useful for debugging + !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau + !print *,x(1:n) + call obj%timer%stop("qr_pdlarfg_1dcomm_& + &PRECISION& + &") + end subroutine + + subroutine qr_pdlarfg2_1dcomm_ref_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk) + use precision + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 + ! 
input variables (local) + integer(kind=ik) :: lda,lwork,ldv,ldt + real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) + + ! input variables (global) + integer(kind=ik) :: m,idx,baseidx,mb,rev,mpicomm +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + ! output variables (global) + integer(kind=ik) :: actualk + + ! derived input variables from QR_PQRPARAM + integer(kind=ik) :: eps + + ! local scalars + real(kind=C_DATATYPE_KIND) :: dseedwork_size(1) + integer(kind=ik) :: seedwork_size,seed_size + integer(kind=ik) :: seedwork_offset,seed_offset + logical :: accurate + call obj%timer%start("qr_pdlarfg2_1dcomm_& + &PRECISION& + &") + + call qr_pdlarfg2_1dcomm_seed_& +&PRECISION & +(obj,a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm) + seedwork_size = int(dseedwork_size(1)) + seed_size = seedwork_size + + if (lwork .eq. -1) then + work(1) = seedwork_size + seed_size + call obj%timer%stop("qr_pdlarfg2_1dcomm_& + &PRECISION& + &") + + return + end if + + seedwork_offset = 1 + seed_offset = seedwork_offset + seedwork_size + + eps = PQRPARAM(3) + + ! check for border cases (only a 2x2 matrix left) + if (idx .le. 1) then +#ifdef DOUBLE_PRECISION_REAL + tau(1:2) = 0.0_rk8 + t(1:2,1:2) = 0.0_rk8 +#else + tau(1:2) = 0.0_rk4 + t(1:2,1:2) = 0.0_rk4 +#endif + + call obj%timer%stop("qr_pdlarfg2_1dcomm_& + &PRECISION& + &") + + return + end if + + call qr_pdlarfg2_1dcomm_seed_& +&PRECISION & +(obj,a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm) + + if (eps .gt. 0) then + accurate = qr_pdlarfg2_1dcomm_check_& +&PRECISION & +(obj,work(seed_offset),eps) + else + accurate = .true. 
+ end if + + call qr_pdlarfg2_1dcomm_vector_& +&PRECISION & +(obj,a(1,2),1,tau(2),work(seed_offset), & + m,mb,idx,0,1,mpicomm) + + call qr_pdlarfg_copy_1dcomm_& +&PRECISION & +(obj,a(1,2),1, & + v(1,2),1, & + m,baseidx,idx,mb,1,mpicomm) + + call qr_pdlarfg2_1dcomm_update_& +&PRECISION & +(obj,v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm) + + ! check for 2x2 matrix case => only one householder Vector will be + ! generated + if (idx .gt. 2) then + if (accurate .eqv. .true.) then + call qr_pdlarfg2_1dcomm_vector_& +&PRECISION & +(obj,a(1,1),1,tau(1),work(seed_offset), & + m,mb,idx-1,1,1,mpicomm) + + call qr_pdlarfg_copy_1dcomm_& +&PRECISION & +(obj,a(1,1),1, & + v(1,1),1, & + m,baseidx,idx-1,mb,1,mpicomm) + + ! generate fuse element + call qr_pdlarfg2_1dcomm_finalize_tmatrix_& +&PRECISION & +(obj,work(seed_offset),tau,t,ldt) + + actualk = 2 + else +#ifdef DOUBLE_PRECISION_REAL + t(1,1) = 0.0_rk8 + t(1,2) = 0.0_rk8 +#else + t(1,1) = 0.0_rk4 + t(1,2) = 0.0_rk4 +#endif + t(2,2) = tau(2) + + actualk = 1 + end if + else +#ifdef DOUBLE_PRECISION_REAL + t(1,1) = 0.0_rk8 + t(1,2) = 0.0_rk8 +#else + t(1,1) = 0.0_rk4 + t(1,2) = 0.0_rk4 +#endif + t(2,2) = tau(2) + + ! no more vectors to create +#ifdef DOUBLE_PRECISION_REAL + tau(1) = 0.0_rk8 +#else + tau(1) = 0.0_rk4 +#endif + + actualk = 2 + + !print *,'rank2: no more data' + end if + call obj%timer%stop("qr_pdlarfg2_1dcomm_& + &PRECISION& + &") + + end subroutine + + subroutine qr_pdlarfg2_1dcomm_seed_& +&PRECISION & +(obj,a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm) + use precision + !use elpa1_impl ! check this + use qr_utils_mod + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: lda,lwork + real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*),seed(*) + + ! input variables (global) + integer(kind=ik) :: n,nb,idx,rev,mpicomm + + ! output variables (global) + + ! 
external functions +#ifdef DOUBLE_PRECISION_REAL + real(kind=C_DATATYPE_KIND), external :: ddot +#else + real(kind=C_DATATYPE_KIND), external :: sdot +#endif + ! local scalars + real(kind=C_DATATYPE_KIND) :: top11,top21,top12,top22 + real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22 + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + integer(kind=ik) :: mpirank_top11,mpirank_top21 + integer(kind=ik) :: top11_offset,top21_offset + integer(kind=ik) :: baseoffset + integer(kind=ik) :: local_offset1,local_size1 + integer(kind=ik) :: local_offset2,local_size2 + + call obj%timer%start("qr_pdlarfg2_1dcomm_seed_& + &PRECISION& + &") + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = 8.0_rk8 +#else + work(1) = 8.0_rk4 +#endif + + call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_& + &PRECISION& + &") + return + end if + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI, kind=c_int) + mpiprocs = int(mpiprocsMPI, kind=c_int) + + call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & + local_size1,baseoffset,local_offset1) + + call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, & + local_size2,baseoffset,local_offset2) + + mpirank_top11 = MOD((idx-1)/nb,mpiprocs) + mpirank_top21 = MOD((idx-2)/nb,mpiprocs) + + top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0) + top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0) + + if (mpirank_top11 .eq. mpirank) then + top11 = a(top11_offset,2) + top12 = a(top11_offset,1) + else +#ifdef DOUBLE_PRECISION_REAL + top11 = 0.0_rk8 + top12 = 0.0_rk8 +#else + top11 = 0.0_rk4 + top12 = 0.0_rk4 +#endif + end if + + if (mpirank_top21 .eq. 
mpirank) then + top21 = a(top21_offset,2) + top22 = a(top21_offset,1) + else +#ifdef DOUBLE_PRECISION_REAL + top21 = 0.0_rk8 + top22 = 0.0_rk8 +#else + top21 = 0.0_rk4 + top22 = 0.0_rk4 +#endif + end if + + ! calculate 3 dot products +#ifdef DOUBLE_PRECISION_REAL + dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1) + dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1) + dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1) +#else + dot11 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1) + dot12 = sdot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1) + dot22 = sdot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1) +#endif + ! store results in work buffer + work(1) = top11 + work(2) = dot11 + work(3) = top12 + work(4) = dot12 + work(5) = top21 + work(6) = top22 + work(7) = dot22 +#ifdef DOUBLE_PRECISION_REAL + work(8) = 0.0_rk8! fill up buffer +#else + work(8) = 0.0_rk4! fill up buffer +#endif + ! exchange partial results +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work, seed, 8_MPI_KIND, mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work, seed, 8_MPI_KIND, mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#endif + +#else + seed(1:8) = work(1:8) +#endif + + call obj%timer%stop("qr_pdlarfg2_1dcomm_seed_& + &PRECISION& + &") + end subroutine + + logical function qr_pdlarfg2_1dcomm_check_& +&PRECISION & +(obj,seed,eps) + use precision + use elpa_abstract_impl + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + + ! input variables + real(kind=C_DATATYPE_KIND) :: seed(*) + integer(kind=ik) :: eps + + ! 
local scalars + real(kind=C_DATATYPE_KIND) :: epsd,first,second,first_second,estimate + logical :: accurate + real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22 + real(kind=C_DATATYPE_KIND) :: top11,top12,top21,top22 + call obj%timer%start("qr_pdlarfg2_1dcomm_check_& + &PRECISION& + &") + + EPSD = EPS + + top11 = seed(1) + dot11 = seed(2) + top12 = seed(3) + dot12 = seed(4) + + top21 = seed(5) + top22 = seed(6) + dot22 = seed(7) + + ! reconstruct the whole inner products + ! (including squares of the top elements) + first = dot11 + top11*top11 + second = dot22 + top22*top22 + top12*top12 + first_second = dot12 + top11*top12 + + ! zero Householder Vector (zero norm) case +#ifdef DOUBLE_PRECISION_REAL + if (first*second .eq. 0.0_rk8) then +#else + if (first*second .eq. 0.0_rk4) then +#endif + qr_pdlarfg2_1dcomm_check_& +&PRECISION & + = .false. + call obj%timer%stop("qr_pdlarfg2_1dcomm_check_& + &PRECISION& + &") + + return + end if + + estimate = abs((first_second*first_second)/(first*second)) + + !print *,'estimate:',estimate + + ! if accurate the following check holds +#ifdef DOUBLE_PRECISION_REAL + accurate = (estimate .LE. (epsd/(1.0_rk8+epsd))) +#else + accurate = (estimate .LE. (epsd/(1.0_rk4+epsd))) +#endif + qr_pdlarfg2_1dcomm_check_& +&PRECISION & + = accurate + call obj%timer%stop("qr_pdlarfg2_1dcomm_check_& + &PRECISION& + &") + + end function + + ! id=0: first Vector + ! id=1: second Vector + subroutine qr_pdlarfg2_1dcomm_vector_& +&PRECISION & +(obj,x,incx,tau,seed,n,nb,idx,id,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: incx + real(kind=C_DATATYPE_KIND) :: x(*),seed(*),tau + + ! input variables (global) + integer(kind=ik) :: n,nb,idx,id,rev,mpicomm + + ! output variables (global) + + ! 
external functions +#ifdef DOUBLE_PRECISION_REAL + real(kind=C_DATATYPE_KIND), external :: dlapy2 + external :: dscal +#else + real(kind=rk4), external :: slapy2 + external :: sscal +#endif + ! local scalars + integer(kind=ik) :: mpirank,mpirank_top,mpiprocs + integer(kind=MPI_KIND) :: mpierr, mpirankMPI, mpiprocsMPI + real(kind=C_DATATYPE_KIND) :: alpha,dot,beta,xnorm + integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx + call obj%timer%start("qr_pdlarfg2_1dcomm_vector_& + &PRECISION& + &") + + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & + local_size,baseoffset,local_offset) + + local_offset = local_offset * incx + + ! Processor id for global index of top element + mpirank_top = MOD((idx-1)/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then + topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) + top = 1+(topidx-1)*incx + else + top = -99 + stop + end if + + alpha = seed(id*5+1) + dot = seed(id*5+2) + + xnorm = sqrt(dot) +#ifdef DOUBLE_PRECISION_REAL + if (xnorm .eq. 0.0_rk8) then + ! H = I + + tau = 0.0_rk8 +#else + if (xnorm .eq. 0.0_rk4) then + ! H = I + + tau = 0.0_rk4 +#endif + else + ! General case +#ifdef DOUBLE_PRECISION_REAL + beta = sign(dlapy2(alpha, xnorm), alpha) +#else + beta = sign(slapy2(alpha, xnorm), alpha) +#endif + tau = (beta+alpha) / beta + + !print *,'hg2',tau,xnorm,alpha +#ifdef DOUBLE_PRECISION_REAL + call dscal(local_size, 1.0_rk8/(beta+alpha), & + x(local_offset), incx) +#else + call sscal(local_size, 1.0_rk4/(beta+alpha), & + x(local_offset), incx) +#endif + + ! TODO: reimplement norm rescale method of + ! original PDLARFG using mpi? + + if (mpirank .eq. 
mpirank_top) then + x(top) = -beta + end if + + seed(8) = beta + end if + call obj%timer%stop("qr_pdlarfg2_1dcomm_vector_& + &PRECISION& + &") + + end subroutine + + subroutine qr_pdlarfg2_1dcomm_update_& +&PRECISION & +(obj,v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: incv,lda + real(kind=C_DATATYPE_KIND) :: v(*),a(lda,*),seed(*) + + ! input variables (global) + integer(kind=ik) :: n,baseidx,idx,nb,rev,mpicomm + + ! output variables (global) + + ! external functions + external daxpy + + ! local scalars + integer(kind=ik) :: mpirank,mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + integer(kind=ik) :: local_size,local_offset,baseoffset + real(kind=C_DATATYPE_KIND) :: z,coeff,beta + real(kind=C_DATATYPE_KIND) :: dot11,dot12,dot22 + real(kind=C_DATATYPE_KIND) :: top11,top12,top21,top22 + call obj%timer%start("qr_pdlarfg2_1dcomm_update_& + &PRECISION& + &") + + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + ! seed should be updated by previous householder generation + ! Update inner product of this column and next column Vector + top11 = seed(1) + dot11 = seed(2) + top12 = seed(3) + dot12 = seed(4) + + top21 = seed(5) + top22 = seed(6) + dot22 = seed(7) + beta = seed(8) + + call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & + local_size,baseoffset,local_offset) + baseoffset = baseoffset * incv + + ! zero Householder Vector (zero norm) case +#ifdef DOUBLE_PRECISION_REAL + if (beta .eq. 0.0_rk8) then +#else + if (beta .eq. 
0.0_rk4) then +#endif + + call obj%timer%stop("qr_pdlarfg2_1dcomm_update_& + &PRECISION& + &") + return + end if + z = (dot12 + top11 * top12) / beta + top12 + + !print *,'hg2 update:',baseidx,idx,mpirank,local_size +#ifdef DOUBLE_PRECISION_REAL + call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1) +#else + call saxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1) +#endif + ! prepare a full dot22 for update + dot22 = dot22 + top22*top22 + + ! calculate coefficient + COEFF = z / (top11 + beta) + + ! update inner product of next Vector + dot22 = dot22 - coeff * (2*dot12 - coeff*dot11) + + ! update dot12 value to represent update with first Vector + ! (needed for T matrix) + dot12 = dot12 - COEFF * dot11 + + ! update top element of next Vector + top22 = top22 - coeff * top21 + seed(6) = top22 + + ! restore separated dot22 for Vector generation + seed(7) = dot22 - top22*top22 + + !------------------------------------------------------ + ! prepare elements for T matrix + seed(4) = dot12 + + ! prepare dot matrix for fuse element of T matrix + ! replace top11 value with -beta1 + seed(1) = beta + call obj%timer%stop("qr_pdlarfg2_1dcomm_update_& + &PRECISION& + &") + + end subroutine + + ! 
run this function after second Vector + subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix_& +&PRECISION & +(obj,seed,tau,t,ldt) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik) :: ldt + real(kind=C_DATATYPE_KIND) :: seed(*),t(ldt,*),tau(*) + real(kind=C_DATATYPE_KIND) :: dot12,beta1,top21,beta2 + call obj%timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix_& + &PRECISION& + &") + + beta1 = seed(1) + dot12 = seed(4) + top21 = seed(5) + beta2 = seed(8) + + !print *,'beta1 beta2',beta1,beta2 + + dot12 = dot12 / beta2 + top21 + dot12 = -(dot12 / beta1) + + t(1,1) = tau(1) + t(1,2) = dot12 + t(2,2) = tau(2) + call obj%timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix_& + &PRECISION& + &") + + end subroutine + + subroutine qr_pdlarfgk_1dcomm_& +&PRECISION & +(obj,a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + + ! input variables (local) + integer(kind=ik) :: lda,lwork,ldv,ldt + real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) + + ! input variables (global) + integer(kind=ik) :: m,k,idx,baseidx,mb,rev,mpicomm +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) ::PQRPARAM(*) +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + ! output variables (global) + integer(kind=ik) :: actualk + + ! 
local scalars + integer(kind=ik) :: ivector + real(kind=C_DATATYPE_KIND) :: pdlarfg_size(1),pdlarf_size(1) + real(kind=C_DATATYPE_KIND) :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1) + real(kind=C_DATATYPE_KIND) :: pdlarfgk_1dcomm_update_size(1) + integer(kind=ik) :: seedC_size,seedC_offset + integer(kind=ik) :: seedD_size,seedD_offset + integer(kind=ik) :: work_offset + call obj%timer%start("qr_pdlarfgk_1dcomm_& + &PRECISION& + &") + + seedC_size = k*k + seedC_offset = 1 + seedD_size = k*k + seedD_offset = seedC_offset + seedC_size + work_offset = seedD_offset + seedD_size + + if (lwork .eq. -1) then + call qr_pdlarfg_1dcomm_& +&PRECISION & +(obj, a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm) + + call qr_pdlarfl_1dcomm_& +&PRECISION & +(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm) + call qr_pdlarfgk_1dcomm_seed_& +&PRECISION & +(obj,a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm) +#ifdef USE_ASSUMED_SIZE_QR + !call qr_pdlarfgk_1dcomm_check_& +!&PRECISION & +!(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) + call qr_pdlarfgk_1dcomm_check_improved_& +&PRECISION & +(obj,work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) +#else + !call qr_pdlarfgk_1dcomm_check_& +!&PRECISION & +!(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) + call qr_pdlarfgk_1dcomm_check_improved_& +&PRECISION & +(obj,work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) +#endif + call qr_pdlarfgk_1dcomm_update_& +&PRECISION & +(obj,a,lda,baseidx,pdlarfgk_1dcomm_update_size(1), & + -1,work,work,k,k,1,work,m,mb,rev,mpicomm) + work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), & + pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=C_DATATYPE_KIND) + + call obj%timer%stop("qr_pdlarfgk_1dcomm_& + &PRECISION& + &") + + return + end if + + call qr_pdlarfgk_1dcomm_seed_& +&PRECISION & 
+(obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), & + work(seedD_offset),m,k,mb,mpicomm) +#ifdef USE_ASSUMED_SIZE_QR + !call qr_pdlarfgk_1dcomm_check_& +!&PRECISION & +!(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) + call qr_pdlarfgk_1dcomm_check_improved_& +&PRECISION & +(obj,work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) + +#else + !call qr_pdlarfgk_1dcomm_check_& +!&PRECISION & +!(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk) + call qr_pdlarfgk_1dcomm_check_improved_& +&PRECISION & +(obj,work(seedC_offset),work(seedD_offset), & + k,PQRPARAM(:),work(work_offset),lwork,actualk) +#endif + !print *,'possible rank:', actualk + + ! override useful for debugging + !actualk = 1 + !actualk = k + !actualk= min(actualk,2) + do ivector=1,actualk + call qr_pdlarfgk_1dcomm_vector_& +&PRECISION & +(obj,a(1,k-ivector+1),1,idx,tau(k-ivector+1), & + work(seedC_offset),work(seedD_offset),k, & + ivector,m,mb,rev,mpicomm) + + call qr_pdlarfgk_1dcomm_update_& +&PRECISION & +(obj,a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), & + work(seedD_offset),k,actualk,ivector,tau, & + m,mb,rev,mpicomm) + + call qr_pdlarfg_copy_1dcomm_& +&PRECISION & +(obj,a(1,k-ivector+1),1, & + v(1,k-ivector+1),1, & + m,baseidx,idx-ivector+1,mb,1,mpicomm) + end do + + ! generate final T matrix and convert preliminary tau values into real ones + call qr_pdlarfgk_1dcomm_generateT_& +&PRECISION & +(obj,work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt) + + call obj%timer%stop("qr_pdlarfgk_1dcomm_& + &PRECISION& + &") + end subroutine + + subroutine qr_pdlarfgk_1dcomm_seed_& +&PRECISION & +(obj,a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + + ! 
input variables (local) + integer(kind=ik) :: lda,lwork + real(kind=C_DATATYPE_KIND) :: a(lda,*), work(*) + + ! input variables (global) + integer(kind=ik) :: m,k,baseidx,mb,mpicomm + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*) + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + integer(kind=ik) :: mpirank,mpiprocs,mpirank_top + integer(kind=MPI_KIND) :: mpierr,mpirankMPI,mpiprocsMPI + integer(kind=ik) :: icol,irow,lidx,remsize + integer(kind=ik) :: remaining_rank + + integer(kind=ik) :: C_size,D_size,sendoffset,recvoffset,sendrecv_size + integer(kind=ik) :: localoffset,localsize,baseoffset + call obj%timer%start("qr_pdlarfgk_1dcomm_seed_& + &PRECISION& + &") + + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + C_size = k*k + D_size = k*k + sendoffset = 1 + sendrecv_size = C_size+D_size + recvoffset = sendoffset + sendrecv_size + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = real(2*sendrecv_size,kind=C_DATATYPE_KIND) +#else + work(1) = real(2*sendrecv_size,kind=rk4) +#endif + call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_& + &PRECISION& + &") + + return + end if + + ! clear buffer +#ifdef DOUBLE_PRECISION_REAL + work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk8 +#else + work(sendoffset:sendoffset+sendrecv_size-1)=0.0_rk4 +#endif + ! collect C part + do icol=1,k + + remaining_rank = k + do while (remaining_rank .gt. 0) + irow = k - remaining_rank + 1 + lidx = baseidx - remaining_rank + 1 + + ! determine chunk where the current top element is located + mpirank_top = MOD((lidx-1)/mb,mpiprocs) + + ! limit max number of remaining elements of this chunk to the block + ! distribution parameter + remsize = min(remaining_rank,mb) + + ! 
determine the number of needed elements in this chunk + call local_size_offset_1d(lidx+remsize-1,mb, & + lidx,lidx,0, & + mpirank_top,mpiprocs, & + localsize,baseoffset,localoffset) + + !print *,'local rank',localsize,localoffset + + if (mpirank .eq. mpirank_top) then + ! copy elements to buffer + work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) & + = a(localoffset:localoffset+remsize-1,icol) + end if + + ! jump to next chunk + remaining_rank = remaining_rank - localsize + end do + end do + + ! collect D part + call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, & + mpirank,mpiprocs, & + localsize,baseoffset,localoffset) + + !print *,'localsize',localsize,localoffset +#ifdef DOUBLE_PRECISION_REAL + if (localsize > 0) then + call dsyrk("Upper", "Trans", k, localsize, & + 1.0_rk8, a(localoffset,1), lda, & + 0.0_rk8, work(sendoffset+C_size), k) + else + work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk8 + end if +#else + if (localsize > 0) then + call ssyrk("Upper", "Trans", k, localsize, & + 1.0_rk4, a(localoffset,1), lda, & + 0.0_rk4, work(sendoffset+C_size), k) + else + work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0_rk4 + end if +#endif + + ! TODO: store symmetric part more efficiently + + ! allreduce operation on results +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(sendoffset),work(recvoffset),int(sendrecv_size,kind=MPI_KIND), & + mpi_real8, mpi_sum, int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work(sendoffset),work(recvoffset), int(sendrecv_size,kind=MPI_KIND), & + mpi_real4, mpi_sum, int(mpicomm,kind=MPI_KIND), mpierr) +#endif + +#else + work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1) +#endif + ! 
unpack result from buffer into seedC and seedD +#ifdef DOUBLE_PRECISION_REAL + seedC(1:k,1:k) = 0.0_rk8 +#else + seedC(1:k,1:k) = 0.0_rk4 +#endif + do icol=1,k + seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1) + end do +#ifdef DOUBLE_PRECISION_REAL + seedD(1:k,1:k) = 0.0_rk8 +#else + seedD(1:k,1:k) = 0.0_rk4 +#endif + do icol=1,k + seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1) + end do + + call obj%timer%stop("qr_pdlarfgk_1dcomm_seed_& + &PRECISION& + &") + + end subroutine + + ! k is assumed to be larger than two + subroutine qr_pdlarfgk_1dcomm_check_improved_& +&PRECISION & +(obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (global) + integer(kind=ik) :: k,lwork +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) + +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),work(k,*) + + ! output variables (global) + integer(kind=ik) :: possiblerank + + ! derived input variables from QR_PQRPARAM + integer(kind=ik) :: eps + + ! local variables + integer(kind=ik) :: i,j,l + real(kind=C_DATATYPE_KIND) :: sum_squares,diagonal_square,epsd,diagonal_root + real(kind=C_DATATYPE_KIND) :: dreverse_matrix_work(1) + + ! external functions +#ifdef DOUBLE_PRECISION_REAL + real(kind=C_DATATYPE_KIND), external :: ddot,dlapy2,dnrm2 + external :: dscal +#else + real(kind=rk4), external :: sdot,slapy2,snrm2 + external :: sscal +#endif + + call obj%timer%start("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + + if (lwork .eq. 
-1) then + call reverse_matrix_local_& +&PRECISION & +(1,k,k,work,k,dreverse_matrix_work,-1) +#ifdef DOUBLE_PRECISION_REAL + work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1) +#else + work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1) +#endif + + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + return + end if + + eps = PQRPARAM(3) + + if (eps .eq. 0) then + possiblerank = k + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + return + end if +#ifdef DOUBLE_PRECISION_REAL + epsd = real(eps,kind=C_DATATYPE_KIND) +#else + epsd = real(eps,kind=rk4) +#endif + + ! build complete inner product from seedC and seedD + ! copy seedD to work + work(:,1:k) = seedD(:,1:k) + + ! add inner products of seedC to work +#ifdef DOUBLE_PRECISION_REAL + call dsyrk("Upper", "Trans", k, k, & + 1.0_rk8, seedC(1,1), k, & + 1.0_rk8, work, k) +#else + call ssyrk("Upper", "Trans", k, k, & + 1.0_rk4, seedC(1,1), k, & + 1.0_rk4, work, k) + +#endif + + ! TODO: optimize this part! + call reverse_matrix_local_& +&PRECISION & +(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) + call reverse_matrix_local_& +&PRECISION & +(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) + + ! transpose matrix + do i=1,k + do j=i+1,k + work(i,j) = work(j,i) + end do + end do + + + ! do cholesky decomposition + i = 0 + do while ((i .lt. k)) + i = i + 1 + + diagonal_square = abs(work(i,i)) + diagonal_root = sqrt(diagonal_square) + + ! zero Householder Vector (zero norm) case +#ifdef DOUBLE_PRECISION_REAL + if ((abs(diagonal_square) .eq. 0.0_rk8) .or. (abs(diagonal_root) .eq. 0.0_rk8)) then +#else + if ((abs(diagonal_square) .eq. 0.0_rk4) .or. (abs(diagonal_root) .eq. 0.0_rk4)) then +#endif + possiblerank = max(i-1,1) + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + return + end if + + ! check if relative error is bounded for each Householder Vector + ! 
Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion + ! holds. + ! first Householder Vector is considered as "stable". + + do j=i+1,k + work(i,j) = work(i,j) / diagonal_root + do l=i+1,j + work(l,j) = work(l,j) - work(i,j) * work(i,l) + end do + end do + !print *,'cholesky step done' + + ! build sum of squares +#ifdef DOUBLE_PRECISION_REAL + if (i .eq. 1) then + sum_squares = 0.0_rk8 + else + sum_squares = ddot(i-1,work(1,i),1,work(1,i),1) + end if +#else + if (i .eq. 1) then + sum_squares = 0.0_rk4 + else + sum_squares = sdot(i-1,work(1,i),1,work(1,i),1) + end if +#endif + if (sum_squares .ge. (epsd * diagonal_square)) then + possiblerank = max(i-1,1) + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + return + end if + end do + + possiblerank = i + !print *,'possible rank', possiblerank + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_improved_& + &PRECISION& + &") + + end subroutine + + ! TODO: zero Householder Vector (zero norm) case + ! - check alpha values as well (from seedC) + subroutine qr_pdlarfgk_1dcomm_check_& +&PRECISION & +(obj,seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) + use precision + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! parameter setup + + ! input variables (local) + + ! input variables (global) + integer(kind=ik) :: k,lwork +#ifdef USE_ASSUMED_SIZE_QR + integer(kind=ik) :: PQRPARAM(*) +#else + integer(kind=ik) :: PQRPARAM(:) +#endif + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),work(k,*) + + ! output variables (global) + integer(kind=ik) :: possiblerank + + ! derived input variables from QR_PQRPARAM + integer(kind=ik) :: eps + + ! 
local scalars + integer(kind=ik) :: icol,isqr,iprod + real(kind=C_DATATYPE_KIND) :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum + real(kind=C_DATATYPE_KIND) :: dreverse_matrix_work(1) + call obj%timer%start("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + if (lwork .eq. -1) then + call reverse_matrix_local_& +&PRECISION & +(1,k,k,work,k,dreverse_matrix_work,-1) +#ifdef DOUBLE_PRECISION_REAL + work(1,1) = real(k*k,kind=C_DATATYPE_KIND) + dreverse_matrix_work(1) +#else + work(1,1) = real(k*k,kind=rk4) + dreverse_matrix_work(1) +#endif + + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + + return + end if + + eps = PQRPARAM(3) + + if (eps .eq. 0) then + possiblerank = k + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + return + end if +#ifdef DOUBLE_PRECISION_REAL + epsd = real(eps,kind=C_DATATYPE_KIND) +#else + epsd = real(eps,kind=rk4) +#endif + + ! copy seedD to work + work(:,1:k) = seedD(:,1:k) + + ! add inner products of seedC to work +#ifdef DOUBLE_PRECISION_REAL + call dsyrk("Upper", "Trans", k, k, & + 1.0_rk8, seedC(1,1), k, & + 1.0_rk8, work, k) +#else + call ssyrk("Upper", "Trans", k, k, & + 1.0_rk4, seedC(1,1), k, & + 1.0_rk4, work, k) +#endif + + ! TODO: optimize this part! + call reverse_matrix_local_& +&PRECISION & +(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) + call reverse_matrix_local_& +&PRECISION & +(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) + + ! transpose matrix + do icol=1,k + do isqr=icol+1,k + work(icol,isqr) = work(isqr,icol) + end do + end do + + ! work contains now the full inner product of the global (sub-)matrix + do icol=1,k + ! zero Householder Vector (zero norm) case +#ifdef DOUBLE_PRECISION_REAL + if (abs(work(icol,icol)) .eq. 0.0_rk8) then +#else + if (abs(work(icol,icol)) .eq. 
0.0_rk4) then +#endif + !print *,'too small ', icol, work(icol,icol) + possiblerank = max(icol,1) + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + return + end if + +#ifdef DOUBLE_PRECISION_REAL + sum_sqr = 0.0_rk8 + do isqr=1,icol-1 + sum_products = 0.0_rk8 +#else + sum_sqr = 0.0_rk4 + do isqr=1,icol-1 + sum_products = 0.0_rk4 +#endif + do iprod=1,isqr-1 + sum_products = sum_products + work(iprod,isqr)*work(iprod,icol) + end do + + !print *,'divisor',icol,isqr,work(isqr,isqr) + temp = (work(isqr,icol) - sum_products)/work(isqr,isqr) + work(isqr,icol) = temp + sum_sqr = sum_sqr + temp*temp + end do + + ! calculate diagonal value + diff = work(icol,icol) - sum_sqr +#ifdef DOUBLE_PRECISION_REAL + if (diff .lt. 0.0_rk8) then +#else + if (diff .lt. 0.0_rk4) then +#endif + ! we definitely have a problem now + possiblerank = icol-1 ! only decompose to previous column (including) + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + return + end if + work(icol,icol) = sqrt(diff) + ! calculate orthogonality +#ifdef DOUBLE_PRECISION_REAL + ortho = 0.0_rk8 + do isqr=1,icol-1 + ortho_sum = 0.0_rk8 +#else + ortho = 0.0_rk4 + do isqr=1,icol-1 + ortho_sum = 0.0_rk4 +#endif + do iprod=isqr,icol-1 + temp = work(isqr,iprod)*work(isqr,iprod) + !print *,'ortho ', work(iprod,iprod) + temp = temp / (work(iprod,iprod)*work(iprod,iprod)) + ortho_sum = ortho_sum + temp + end do + ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol)) + end do + + ! ---------------- with division by zero ----------------------- ! + + !ortho = ortho / diff; + + ! if current estimate is not accurate enough, the following check holds + !if (ortho .gt. epsd) then + ! possiblerank = icol-1 ! only decompose to previous column (including) + ! return + !end if + + ! ---------------- without division by zero ----------------------- ! + + ! if current estimate is not accurate enough, the following check holds + if (ortho .gt. epsd * diff) then + possiblerank = icol-1 ! 
only decompose to previous column (including) + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + return + end if + end do + + ! if we get to this point, the accuracy condition holds for the whole block + possiblerank = k + call obj%timer%stop("qr_pdlarfgk_1dcomm_check_& + &PRECISION& + &") + end subroutine + + !sidx: seed idx + !k: max rank used during seed phase + !rank: actual rank (k >= rank) + subroutine qr_pdlarfgk_1dcomm_vector_& +&PRECISION & +(obj,x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: incx + real(kind=C_DATATYPE_KIND) :: x(*),tau + + ! input variables (global) + integer(kind=ik) :: n,nb,baseidx,rev,mpicomm,k,sidx + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*) + + ! output variables (global) + + ! external functions +#ifdef DOUBLE_PRECISION_REAL + real(kind=C_DATATYPE_KIND), external :: dlapy2,dnrm2 + external :: dscal +#else + real(kind=rk4), external :: slapy2,snrm2 + external :: sscal +#endif + + ! local scalars + integer(kind=ik) :: mpirank,mpirank_top,mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + real(kind=C_DATATYPE_KIND) :: alpha,dot,beta,xnorm + integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx + integer(kind=ik) :: lidx + call obj%timer%start("qr_pdlarfgk_1dcomm_vector_& + &PRECISION& + &") + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + + lidx = baseidx-sidx+1 + call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, & + local_size,baseoffset,local_offset) + + local_offset = local_offset * incx + + ! 
Processor id for global index of top element + mpirank_top = MOD((lidx-1)/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then + topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0) + top = 1+(topidx-1)*incx + end if + + alpha = seedC(k-sidx+1,k-sidx+1) + dot = seedD(k-sidx+1,k-sidx+1) + ! assemble actual norm from both seed parts +#ifdef DOUBLE_PRECISION_REAL + xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1)) + + if (xnorm .eq. 0.0_rk8) then + tau = 0.0_rk8 + else + + ! General case + + beta = sign(dlapy2(alpha, xnorm), alpha) + ! store a preliminary version of beta in tau + tau = beta + + ! update global part + call dscal(local_size, 1.0_rk8/(beta+alpha), & + x(local_offset), incx) +#else + xnorm = slapy2(sqrt(dot), snrm2(k-sidx,seedC(1,k-sidx+1),1)) + + if (xnorm .eq. 0.0_rk4) then + tau = 0.0_rk4 + else + + ! General case + + beta = sign(slapy2(alpha, xnorm), alpha) + ! store a preliminary version of beta in tau + tau = beta + + ! update global part + call sscal(local_size, 1.0_rk4/(beta+alpha), & + x(local_offset), incx) + +#endif + ! do not update local part here due to + ! dependency of c Vector during update process + + ! TODO: reimplement norm rescale method of + ! original PDLARFG using mpi? + + if (mpirank .eq. mpirank_top) then + x(top) = -beta + end if + end if + call obj%timer%stop("qr_pdlarfgk_1dcomm_vector_& + &PRECISION& + &") + + end subroutine + + !k: original max rank used during seed function + !rank: possible rank as from check function + ! TODO: if rank is less than k, reduce buffersize in such a way + ! that only the required entries for the next pdlarfg steps are + ! computed + subroutine qr_pdlarfgk_1dcomm_update_& +&PRECISION & +(obj,a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! 
parameter setup + INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 + + ! input variables (local) + integer(kind=ik) :: lda,lwork + real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*) + + ! input variables (global) + integer(kind=ik) :: k,rank,sidx,n,baseidx,nb,rev,mpicomm + real(kind=C_DATATYPE_KIND) :: beta + + ! output variables (global) + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),tau(*) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + real(kind=C_DATATYPE_KIND) :: alpha + integer(kind=ik) :: coffset,zoffset,yoffset,voffset,buffersize + integer(kind=ik) :: mpirank,mpiprocs,mpirank_top + integer(kind=MPI_KIND) :: mpirankMPI, mpierr,mpiprocsMPI + + integer(kind=ik) :: localsize,baseoffset,localoffset,topidx + integer(kind=ik) :: lidx + call obj%timer%start("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + if (lwork .eq. -1) then + ! buffer for c,z,y,v + work(1) = 4*k + call obj%timer%stop("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + + return + end if + + ! nothing to update anymore + if (sidx .gt. rank) then + call obj%timer%stop("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + return + endif + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + + lidx = baseidx-sidx + if (lidx .lt. 1) then + call obj%timer%stop("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + return + endif + + call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,localoffset) + + coffset = 1 + zoffset = coffset + k + yoffset = zoffset + k + voffset = yoffset + k + buffersize = k - sidx + + ! finalize tau values + alpha = seedC(k-sidx+1,k-sidx+1) + beta = tau(k-sidx+1) + + ! zero Householder Vector (zero norm) case + !print *,'k update: alpha,beta',alpha,beta +#ifdef DOUBLE_PRECISION_REAL + if ((beta .eq. 0.0_rk8) .or. (alpha .eq. 
0.0_rk8)) then + tau(k-sidx+1) = 0.0_rk8 + seedC(k,k-sidx+1) = 0.0_rk8 +#else + if ((beta .eq. 0.0_rk4) .or. (alpha .eq. 0.0_rk4)) then + tau(k-sidx+1) = 0.0_rk4 + seedC(k,k-sidx+1) = 0.0_rk4 +#endif + + call obj%timer%stop("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + return + end if + + tau(k-sidx+1) = (beta+alpha) / beta + + ! --------------------------------------- + ! calculate c Vector (extra Vector or encode in seedC/seedD? + work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) +#ifdef DOUBLE_PRECISION_REAL + call dgemv("Trans", buffersize+1, buffersize, & + 1.0_rk8,seedC(1,1),k,seedC(1,k-sidx+1),1, & + 1.0_rk8,work(coffset),1) + + ! calculate z using tau,seedD,seedC and c Vector + work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize) + call daxpy(buffersize, 1.0_rk8/beta, work(coffset), 1, work(zoffset), 1) + + ! update A1(local copy) and generate part of householder vectors for use + call daxpy(buffersize, -1.0_rk8, work(zoffset),1,seedC(k-sidx+1,1),k) + call dscal(buffersize, 1.0_rk8/(alpha+beta), seedC(1,k-sidx+1),1) + call dger(buffersize, buffersize, -1.0_rk8, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k) + + ! update A global (householder Vector already generated by pdlarfgk) + mpirank_top = MOD(lidx/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then + ! handle first row separately + topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0) + call daxpy(buffersize,-1.0_rk8,work(zoffset),1,a(topidx,1),lda) + end if + + call dger(localsize, buffersize,-1.0_rk8, & + a(localoffset,k-sidx+1),1,work(zoffset),1, & + a(localoffset,1),lda) + + ! update D (symmetric) => two buffer vectors of size rank + ! generate y Vector + work(yoffset:yoffset+buffersize-1) = 0._rk8 + call daxpy(buffersize,1.0_rk8/(alpha+beta),work(zoffset),1,work(yoffset),1) + + ! 
generate v Vector + work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) + call daxpy(buffersize, -0.5_rk8*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1) + + ! symmetric update of D using y and v + call dsyr2("Upper", buffersize,-1.0_rk8, & + work(yoffset),1,work(voffset),1, & + seedD(1,1), k) + + ! prepare T matrix inner products + ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y' + ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements + call dger(k-sidx,sidx,-1.0_rk8,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k) + seedC(k,k-sidx+1) = 1.0_rk8/(alpha+beta) +#else /* DOUBLE_PRECISION_REAL */ + call sgemv("Trans", buffersize+1, buffersize, & + 1.0_rk4,seedC(1,1),k,seedC(1,k-sidx+1),1, & + 1.0_rk4,work(coffset),1) + + ! calculate z using tau,seedD,seedC and c Vector + work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize) + call saxpy(buffersize, 1.0_rk4/beta, work(coffset), 1, work(zoffset), 1) + + ! update A1(local copy) and generate part of householder vectors for use + call saxpy(buffersize, -1.0_rk4, work(zoffset),1,seedC(k-sidx+1,1),k) + call sscal(buffersize, 1.0_rk4/(alpha+beta), seedC(1,k-sidx+1),1) + call sger(buffersize, buffersize, -1.0_rk4, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k) + + ! update A global (householder Vector already generated by pdlarfgk) + mpirank_top = MOD(lidx/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then + ! handle first row separately + topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0) + call saxpy(buffersize,-1.0_rk4,work(zoffset),1,a(topidx,1),lda) + end if + + call sger(localsize, buffersize,-1.0_rk4, & + a(localoffset,k-sidx+1),1,work(zoffset),1, & + a(localoffset,1),lda) + + ! update D (symmetric) => two buffer vectors of size rank + ! generate y Vector + work(yoffset:yoffset+buffersize-1) = 0._rk4 + call saxpy(buffersize,1.0_rk4/(alpha+beta),work(zoffset),1,work(yoffset),1) + + ! 
generate v Vector + work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) + call saxpy(buffersize, -0.5_rk4*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1) + + ! symmetric update of D using y and v + call ssyr2("Upper", buffersize,-1.0_rk4, & + work(yoffset),1,work(voffset),1, & + seedD(1,1), k) + + ! prepare T matrix inner products + ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y' + ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements + call sger(k-sidx,sidx,-1.0_rk4,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k) + seedC(k,k-sidx+1) = 1.0_rk4/(alpha+beta) +#endif /* DOUBLE_PRECISION_REAL */ + + call obj%timer%stop("qr_pdlarfgk_1dcomm_update_& + &PRECISION& + &") + end subroutine + + subroutine qr_pdlarfgk_1dcomm_generateT_& + &PRECISION & + (obj,seedC,seedD,k,actualk,tau,t,ldt) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: k,actualk,ldt + real(kind=C_DATATYPE_KIND) :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*) + + integer(kind=ik) :: irow,icol + real(kind=C_DATATYPE_KIND) :: column_coefficient + call obj%timer%start("qr_pdlarfgk_1dcomm_generateT_& + &PRECISION& + &") + + !print *,'reversed on the fly T generation NYI' + + do icol=1,actualk-1 + ! calculate inner product of householder Vector parts in seedC + ! (actually calculating more than necessary, if actualk < k) + ! => a lot of junk from row 1 to row k-actualk +#ifdef DOUBLE_PRECISION_REAL + call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1) +#else + call strmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1) +#endif + ! 
add scaled D parts to current column of C (will become later T rows) + column_coefficient = seedC(k,k-icol+1) + do irow=k-actualk+1,k-1 + seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) + ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) ) + end do + end do + + call qr_dlarft_kernel_& + &PRECISION & + (actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt) + call obj%timer%stop("qr_pdlarfgk_1dcomm_generateT_& + &PRECISION& + &") + + end subroutine + + !direction=0: pack into work buffer + !direction=1: unpack from work buffer + subroutine qr_pdgeqrf_pack_unpack_& +&PRECISION & +(obj,v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: ldv,lwork + real(kind=C_DATATYPE_KIND) :: v(ldv,*), work(*) + + ! input variables (global) + integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm + + ! output variables (global) + + ! local scalars + integer(kind=ik) :: mpirank,mpiprocs + integer(kind=MPI_KIND) :: mpierr,mpirankMPI, mpiprocsMPI + + integer(kind=ik) :: buffersize,icol + integer(kind=ik) :: local_size,baseoffset,offset + + ! external functions + call obj%timer%start("qr_pdgeqrf_pack_unpack_& + &PRECISION& + &") + call mpi_comm_rank(int(mpicomm,kind=MPI_KIND) ,mpirankMPI,mpierr) + call mpi_comm_size(int(mpicomm,kind=MPI_KIND) ,mpiprocsMPI,mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + + call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, & + local_size,baseoffset,offset) + + !print *,'pack/unpack',local_size,baseoffset,offset + + ! rough approximate for buffer size + if (lwork .eq. -1) then + buffersize = local_size * n ! 
Vector elements + work(1) = DBLE(buffersize) + call obj%timer%stop("qr_pdgeqrf_pack_unpack_& + &PRECISION& + &") + + return + end if + + if (direction .eq. 0) then + ! copy v part to buffer (including zeros) + do icol=1,n + work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol) + end do + else + ! copy v part from buffer (including zeros) + do icol=1,n + v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol) + end do + end if + call obj%timer%stop("qr_pdgeqrf_pack_unpack_& + &PRECISION& + &") + + return + + end subroutine + + !direction=0: pack into work buffer + !direction=1: unpack from work buffer + subroutine qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION & + (obj,tau,t,ldt,work,lwork,n,direction) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: ldt,lwork + real(kind=C_DATATYPE_KIND) :: work(*), t(ldt,*),tau(*) + + ! input variables (global) + integer(kind=ik) :: n,direction + + ! output variables (global) + + ! local scalars + integer(kind=ik) :: icol + + ! external functions + call obj%timer%start("qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION& + &") + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = real(n*n,kind=C_DATATYPE_KIND) +#else + work(1) = real(n*n,kind=rk4) +#endif + + call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION& + &") + return + end if + + if (direction .eq. 0) then + ! append t matrix to buffer (including zeros) + do icol=1,n + work(1+(icol-1)*n:icol*n) = t(1:n,icol) + end do + else + ! append t matrix from buffer (including zeros) + do icol=1,n + t(1:n,icol) = work(1+(icol-1)*n:icol*n) + tau(icol) = t(icol,icol) + end do + end if + call obj%timer%stop("qr_pdgeqrf_pack_unpack_tmatrix_& + &PRECISION& + &") + end subroutine + + +#ifndef ALREADY_DEFINED + ! TODO: encode following functionality + ! 
- Direction? BOTTOM UP or TOP DOWN ("Up", "Down") + ! => influences all related kernels (including DLARFT / DLARFB) + ! - rank-k parameter (k=1,2,...,b) + ! => influences possible update strategies + ! => parameterize the function itself? (FUNCPTR, FUNCARG) + ! - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels) + ! - subblocking + ! (maximum block size bounded by data distribution along rows) + ! - blocking method (householder vectors only or compact WY?) + ! - update strategy of trailing parts (incremental, complete) + ! - difference for subblocks and normal blocks? (UPDATE and UPDATESUB) + ! o "Incremental" + ! o "Full" + ! - final T generation (recursive: subblock wise, block wise, end) (TMERGE) + ! ' (implicitly given by / influences update strategies?) + ! => alternative: during update: iterate over sub t parts + ! => advantage: smaller (cache aware T parts) + ! => disadvantage: more memory write backs + ! (number of T parts * matrix elements) + ! - partial/sub T generation (TGEN) + ! o add vectors right after creation (Vector) + ! o add set of vectors (Set) + ! - bcast strategy of householder vectors to other process columns + ! (influences T matrix generation and trailing update + ! in other process columns) + ! o no broadcast (NONE = benchmarking?, + ! or not needed due to 1D process grid) + ! o after every housegen (VECTOR) + ! o after every subblk (SUBBLOCK) + ! o after full local column block decomposition (BLOCK) + ! LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft + + !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, & + ! SUBBLK, UPDATE, TGEN, BCAST) + ! gmode: control communication pattern of dlarfg + ! maxrank: control max number of householder vectors per communication + ! eps: error threshold (integer) + ! update*: control update pattern in pdgeqr2_1dcomm ('incremental','full','merge') + ! merging = full update with tmatrix merging + ! 
tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge + ! only matters if update* == full + subroutine qr_pqrparam_init(obj,pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input + CHARACTER :: update2d,update1d,update,hgmode + INTEGER(kind=ik) :: size2d,size1d,maxrank,eps,tmerge2d,tmerge1d + + ! output +#ifdef USE_ASSUMED_SIZE_QR + INTEGER(kind=ik) :: PQRPARAM(*) +#else + INTEGER(kind=ik) :: PQRPARAM(1:11) +#endif + + call obj%timer%start("qr_pqrparam_init") + + PQRPARAM(1) = size2d + PQRPARAM(2) = ichar(update2d) + PQRPARAM(3) = tmerge2d + ! TODO: broadcast T yes/no + + PQRPARAM(4) = size1d + PQRPARAM(5) = ichar(update1d) + PQRPARAM(6) = tmerge1d + + PQRPARAM(7) = maxrank + PQRPARAM(8) = ichar(update) + PQRPARAM(9) = eps + PQRPARAM(10) = ichar(hgmode) + call obj%timer%stop("qr_pqrparam_init") + + end subroutine qr_pqrparam_init +#endif /* ALREADY_DEFINED */ + + subroutine qr_pdlarfg_copy_1dcomm_& +&PRECISION & +(obj,x,incx,v,incv,n,baseidx,idx,nb,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + ! input variables (local) + integer(kind=ik) :: incx,incv + real(kind=C_DATATYPE_KIND) :: x(*), v(*) + + ! input variables (global) + integer(kind=ik) :: baseidx,idx,rev,nb,n + integer(kind=ik) :: mpicomm + + ! output variables (global) + + ! 
local scalars + integer(kind=ik) :: mpiprocs + integer(kind=MPI_KIND) :: mpierr,mpiprocsMPI,mpirankMPI + + integer(kind=ik) :: mpirank,mpirank_top + integer(kind=ik) :: irow,x_offset + integer(kind=ik) :: v_offset,local_size + + call obj%timer%start("qr_pdlarfg_copy_1dcomm_& + &PRECISION& + &") + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & + local_size,v_offset,x_offset) + v_offset = v_offset * incv + + !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size + + ! copy elements + do irow=1,local_size + v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset) + end do + + ! replace top element to build an unitary Vector + mpirank_top = MOD((idx-1)/nb,mpiprocs) + if (mpirank .eq. mpirank_top) then +#ifdef DOUBLE_PRECISION_REAL + v(local_size*incv) = 1.0_rk8 +#else + v(local_size*incv) = 1.0_rk4 +#endif + end if + call obj%timer%stop("qr_pdlarfg_copy_1dcomm_& + &PRECISION& + &") + + end subroutine + +! vim: syntax=fortran diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_pdlarfb.F90 elpa-2019.11.001/src/elpa2/qr/elpa_pdlarfb.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_pdlarfb.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_pdlarfb.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,93 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" + +module elpa_pdlarfb + + use elpa1_compute + use qr_utils_mod + use elpa_qrkernels + use elpa_mpi + implicit none + + PRIVATE + + public :: qr_pdlarfb_1dcomm_double + public :: qr_pdlarft_pdlarfb_1dcomm_double + public :: qr_pdlarft_set_merge_1dcomm_double + public :: qr_pdlarft_tree_merge_1dcomm_double + public :: qr_pdlarfl_1dcomm_double + public :: qr_pdlarfl2_tmatrix_1dcomm_double + public :: qr_tmerge_pdlarfb_1dcomm_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: qr_pdlarfb_1dcomm_single + public :: qr_pdlarft_pdlarfb_1dcomm_single + public :: qr_pdlarft_set_merge_1dcomm_single + public :: qr_pdlarft_tree_merge_1dcomm_single + public :: qr_pdlarfl_1dcomm_single + public :: qr_pdlarfl2_tmatrix_1dcomm_single + public :: qr_tmerge_pdlarfb_1dcomm_single +#endif + +contains + ! real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "elpa_pdlarfb_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL + ! real single precision +#define REALCASE 1 +#define ALREADY_DEFINED +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "elpa_pdlarfb_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +end module elpa_pdlarfb diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_pdlarfb_template.F90 elpa-2019.11.001/src/elpa2/qr/elpa_pdlarfb_template.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_pdlarfb_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_pdlarfb_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,868 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! 
Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +subroutine qr_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork) + use precision + use qr_utils_mod + + implicit none + + ! input variables (local) + integer(kind=ik) :: lda,ldv,ldt,lwork + real(kind=C_DATATYPE_KIND) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(k,*) + + ! input variables (global) + integer(kind=ik) :: m,mb,n,k,baseidx,idx,rev,mpicomm + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! 
local scalars + integer(kind=ik) :: localsize,offset,baseoffset + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + + if (idx .le. 1) return + + if (n .le. 0) return ! nothing to do + + if (k .eq. 1) then + call qr_pdlarfl_1dcomm_& + &PRECISION & + (v,1,baseidx,a,lda,tau(1), & + work,lwork,m,n,idx,mb,rev,mpicomm) + return + else if (k .eq. 2) then + call qr_pdlarfl2_tmatrix_1dcomm_& + &PRECISION & + (v,ldv,baseidx,a,lda,t,ldt, & + work,lwork,m,n,idx,mb,rev,mpicomm) + return + end if + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1,1) =real(2*k*n,kind=rk8) +#else + work(1,1) =real(2*k*n,kind=rk4) +#endif + return + end if + + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND) ,mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + ! use baseidx as idx here, otherwise the upper triangle part will be lost + ! during the calculation, especially in the reversed case + call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,offset) + + ! Z' = Y' * A + if (localsize .gt. 0) then +#ifdef DOUBLE_PRECISION_REAL + call dgemm("Trans","Notrans",k,n,localsize,1.0_rk8,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk8,work(1,1),k) +#else + call sgemm("Trans","Notrans",k,n,localsize,1.0_rk4,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk4,work(1,1),k) +#endif + else +#ifdef DOUBLE_PRECISION_REAL + work(1:k,1:n) = 0.0_rk8 +#else + work(1:k,1:n) = 0.0_rk4 +#endif + end if + + ! 
data exchange +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(1,1),work(1,n+1),int(k*n,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work(1,1),work(1,n+1),int(k*n,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#endif + +#else /* WITH_MPI */ + work(1:k*n,n+1) = work(1:k*n,1) +#endif + call qr_pdlarfb_kernel_local_& + &PRECISION & + (localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t,ldt,work(1,n+1),k) +end subroutine + +! generalized pdlarfl2 version +! TODO: include T merge here (seperate by "old" and "new" index) +subroutine qr_pdlarft_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx,rev,mpicomm,work,lwork) + use precision + use qr_utils_mod + + implicit none + + ! input variables (local) + integer(kind=ik) :: ldv,ldt,lda,lwork + real(kind=C_DATATYPE_KIND) :: v(ldv,*),tau(*),t(ldt,*),work(k,*),a(lda,*) + + ! input variables (global) + integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,mpicomm + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + integer(kind=ik) :: localsize,offset,baseoffset + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + integer(kind=ik) :: icol + + integer(kind=ik) :: sendoffset,recvoffset,sendsize + + sendoffset = 1 + sendsize = k*(k+n+oldk) + recvoffset = sendoffset+(k+n+oldk) + + if (lwork .eq. 
-1) then +#ifdef DOUBLE_PRECISION_REAL + work(1,1) = real(2*(k*k+k*n+oldk), kind=rk8) +#else + work(1,1) = real(2*(k*k+k*n+oldk), kind=rk4) +#endif + return + end if + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND) ,mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND) ,mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,offset) + +#ifdef DOUBLE_PRECISION_REAL + if (localsize .gt. 0) then + ! calculate inner product of householdervectors + call dsyrk("Upper","Trans",k,localsize,1.0_rk8,v(baseoffset,1),ldv,0.0_rk8,work(1,1),k) + + ! calculate matrix matrix product of householder vectors and target matrix + ! Z' = Y' * A + call dgemm("Trans","Notrans",k,n,localsize,1.0_rk8,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk8,work(1,k+1),k) + + ! TODO: reserved for T merge parts + work(1:k,n+k+1:n+k+oldk) = 0.0_rk8 + else + work(1:k,1:(n+k+oldk)) = 0.0_rk8 + end if +#else /* DOUBLE_PRECISION_REAL */ + if (localsize .gt. 0) then + ! calculate inner product of householdervectors + call ssyrk("Upper","Trans",k,localsize,1.0_rk4,v(baseoffset,1),ldv,0.0_rk4,work(1,1),k) + + ! calculate matrix matrix product of householder vectors and target matrix + ! Z' = Y' * A + call sgemm("Trans","Notrans",k,n,localsize,1.0_rk4,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk4,work(1,k+1),k) + + ! TODO: reserved for T merge parts + work(1:k,n+k+1:n+k+oldk) = 0.0_rk4 + else + work(1:k,1:(n+k+oldk)) = 0.0_rk4 + end if +#endif /* DOUBLE_PRECISION_REAL */ + + ! 
exchange data +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),int(sendsize,kind=MPI_KIND), mpi_real8, & + mpi_sum, int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),int(sendsize,kind=MPI_KIND), mpi_real4, & + mpi_sum, int(mpicomm,kind=MPI_KIND), mpierr) +#endif + +#else /* WITH_MPI */ + work(1:sendsize,recvoffset) = work(1:sendsize,sendoffset) +#endif + ! generate T matrix (pdlarft) +#ifdef DOUBLE_PRECISION_REAL + t(1:k,1:k) = 0.0_rk8 ! DEBUG: clear buffer first +#else + t(1:k,1:k) = 0.0_rk4 ! DEBUG: clear buffer first +#endif + ! T1 = tau1 + ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | + ! | 0 Tk-1 | + t(k,k) = tau(k) + do icol=k-1,1,-1 + t(icol,icol+1:k) = -tau(icol)*work(icol,recvoffset+icol:recvoffset+k-1) +#ifdef DOUBLE_PRECISION_REAL + call dtrmv("Upper","Trans","Nonunit",k-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) +#else + call strmv("Upper","Trans","Nonunit",k-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) +#endif + t(icol,icol) = tau(icol) + end do + + ! TODO: elmroth and gustavson + + ! update matrix (pdlarfb) + ! Z' = T * Z' +#ifdef DOUBLE_PRECISION_REAL + call strmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk8,t,ldt,work(1,recvoffset+k),k) + + ! A = A - Y * V' + call sgemm("Notrans","Notrans",localsize,n,k,-1.0_rk8,v(baseoffset,1),ldv,work(1,recvoffset+k),k,1.0_rk8,a(offset,1),lda) +#else + call strmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk4,t,ldt,work(1,recvoffset+k),k) + + ! A = A - Y * V' + call sgemm("Notrans","Notrans",localsize,n,k,-1.0_rk4,v(baseoffset,1),ldv,work(1,recvoffset+k),k,1.0_rk4,a(offset,1),lda) + +#endif +end subroutine + +subroutine qr_pdlarft_set_merge_1dcomm_& +&PRECISION & +(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) + use precision + use qr_utils_mod + + implicit none + + ! 
input variables (local) + integer(kind=ik) :: ldv,ldt,lwork + real(kind=C_DATATYPE_KIND) :: v(ldv,*),t(ldt,*),work(n,*) + + ! input variables (global) + integer(kind=ik) :: m,mb,n,blocksize,baseidx,rev,mpicomm + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + integer(kind=ik) :: localsize,offset,baseoffset + integer(kind=ik) :: mpirank,mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1,1) = real(2*n*n,kind=rk8) +#else + work(1,1) = real(2*n*n,kind=rk4) + +#endif + return + end if + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,offset) +#ifdef DOUBLE_PRECISION_REAL + if (localsize .gt. 0) then + call dsyrk("Upper","Trans",n,localsize,1.0_rk8,v(baseoffset,1),ldv,0.0_rk8,work(1,1),n) + else + work(1:n,1:n) = 0.0_rk8 + end if +#else + if (localsize .gt. 0) then + call ssyrk("Upper","Trans",n,localsize,1.0_rk4,v(baseoffset,1),ldv,0.0_rk4,work(1,1),n) + else + work(1:n,1:n) = 0.0_rk4 + end if + +#endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(1,1),work(1,n+1),int(n*n,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND) ,mpierr) +#else + call mpi_allreduce(work(1,1),work(1,n+1),int(n*n,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND) ,mpierr) +#endif + +#else + work(1:n,n+1:n+1+n-1) = work(1:n,1:n) +#endif + ! skip Y4'*Y4 part + offset = mod(n,blocksize) + if (offset .eq. 
0) offset=blocksize + call qr_tmerge_set_kernel_& + &PRECISION & + (n,blocksize,t,ldt,work(1,n+1+offset),n) + +end subroutine + +subroutine qr_pdlarft_tree_merge_1dcomm_& +&PRECISION & +(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) + use precision + use qr_utils_mod + + implicit none + + ! input variables (local) + integer(kind=ik) :: ldv,ldt,lwork + real(kind=C_DATATYPE_KIND) :: v(ldv,*),t(ldt,*),work(n,*) + + ! input variables (global) + integer(kind=ik) :: m,mb,n,blocksize,treeorder,baseidx,rev,mpicomm + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + integer(kind=ik) :: localsize,offset,baseoffset + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI ,mpierr + + if (lwork .eq. -1) then +#ifdef DOUBLE_PRECISION_REAL + work(1,1) = real(2*n*n,kind=rk8) +#else + work(1,1) = real(2*n*n,kind=rk4) +#endif + return + end if + + if (n .le. blocksize) return ! nothing to do + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,offset) + +#ifdef DOUBLE_PRECISION_REAL + if (localsize .gt. 0) then + call dsyrk("Upper","Trans",n,localsize,1.0_rk8,v(baseoffset,1),ldv,0.0_rk8,work(1,1),n) + else + work(1:n,1:n) = 0.0_rk8 + end if +#else + if (localsize .gt. 
0) then + call ssyrk("Upper","Trans",n,localsize,1.0_rk4,v(baseoffset,1),ldv,0.0_rk4,work(1,1),n) + else + work(1:n,1:n) = 0.0_rk4 + end if +#endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(1,1),work(1,n+1),int(n*n,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work(1,1),work(1,n+1),int(n*n,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#endif +#else + work(1:n,n+1:n+1+n-1) = work(1:n,1:n) +#endif + ! skip Y4'*Y4 part + offset = mod(n,blocksize) + if (offset .eq. 0) offset=blocksize + call qr_tmerge_tree_kernel_& + &PRECISION & + (n,blocksize,treeorder,t,ldt,work(1,n+1+offset),n) + +end subroutine + +! apply householder Vector to the left +! - assume unitary matrix +! - assume right positions for v +subroutine qr_pdlarfl_1dcomm_& +&PRECISION & +(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,mpicomm) + use precision + use elpa1_impl + use qr_utils_mod + + implicit none + + ! input variables (local) + integer(kind=ik) :: incv,lda,lwork,baseidx + real(kind=C_DATATYPE_KIND) :: v(*),a(lda,*),work(*) + + ! input variables (global) + integer(kind=ik) :: m,n,mb,rev,idx,mpicomm + real(kind=C_DATATYPE_KIND) :: tau + + ! output variables (global) + + ! local scalars + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpierr, mpirankMPI, mpiprocsMPI + integer(kind=ik) :: sendsize,recvsize,icol + integer(kind=ik) :: local_size,local_offset + integer(kind=ik) :: v_local_offset + + ! external functions + real(kind=C_DATATYPE_KIND), external :: ddot + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI, kind=c_int) + mpiprocs = int(mpiprocsMPI, kind=c_int) + sendsize = n + recvsize = sendsize + + if (lwork .eq. 
-1) then
+#ifdef DOUBLE_PRECISION_REAL
+ work(1) = real(sendsize + recvsize,kind=rk8)
+#else
+ work(1) = real(sendsize + recvsize,kind=rk4)
+#endif
+ return
+ end if
+
+ if (n .le. 0) return
+
+ if (idx .le. 1) return
+
+ call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, &
+ local_size,v_local_offset,local_offset)
+
+ !print *,'hl ref',local_size,n
+
+ v_local_offset = v_local_offset * incv
+
+ if (local_size > 0) then
+
+ do icol=1,n
+ work(icol) = dot_product(v(v_local_offset:v_local_offset+local_size-1),a(local_offset:local_offset+local_size-1,icol))
+
+ end do
+ else
+#ifdef DOUBLE_PRECISION_REAL
+ work(1:n) = 0.0_rk8
+#else
+ work(1:n) = 0.0_rk4
+#endif
+ end if
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+ call mpi_allreduce(work, work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real8, mpi_sum, &
+ int(mpicomm,kind=MPI_KIND), mpierr)
+#else
+ call mpi_allreduce(work, work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real4, mpi_sum, &
+ int(mpicomm,kind=MPI_KIND), mpierr)
+#endif
+#else
+ work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
+#endif
+ if (local_size > 0) then
+
+ do icol=1,n
+ a(local_offset:local_offset+local_size-1,icol) = a(local_offset:local_offset+local_size-1,icol) &
+ - tau*work(sendsize+icol)*v(v_local_offset:v_local_offset+ &
+ local_size-1)
+ enddo
+ end if
+
+end subroutine
+
+subroutine qr_pdlarfl2_tmatrix_1dcomm_&
+&PRECISION &
+(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,idx,mb,rev,mpicomm)
+ use precision
+ use elpa1_impl
+ use qr_utils_mod
+
+ implicit none
+
+ ! input variables (local)
+ integer(kind=ik) :: ldv,lda,lwork,baseidx,ldt
+ real(kind=C_DATATYPE_KIND) :: v(ldv,*),a(lda,*),work(*),t(ldt,*)
+
+ ! input variables (global)
+ integer(kind=ik) :: m,n,mb,rev,idx,mpicomm
+
+ ! output variables (global)
+
+ ! 
local scalars + integer(kind=ik) :: mpirank,mpiprocs,mpirank_top1,mpirank_top2 + integer(kind=MPI_KIND) :: mpierr, mpirankMPI, mpiprocsMPI + integer(kind=ik) :: dgemv1_offset,dgemv2_offset + integer(kind=ik) :: sendsize, recvsize + integer(kind=ik) :: local_size1,local_offset1 + integer(kind=ik) :: local_size2,local_offset2 + integer(kind=ik) :: local_size_dger,local_offset_dger + integer(kind=ik) :: v1_local_offset,v2_local_offset + integer(kind=ik) :: v_local_offset_dger + real(kind=C_DATATYPE_KIND) :: hvdot + integer(kind=ik) :: irow,icol,v1col,v2col + + ! external functions + real(kind=C_DATATYPE_KIND), external :: ddot + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + sendsize = 2*n + recvsize = sendsize + + if (lwork .eq. -1) then + work(1) = sendsize + recvsize + return + end if + + dgemv1_offset = 1 + dgemv2_offset = dgemv1_offset + n + + ! in 2x2 matrix case only one householder Vector was generated + if (idx .le. 2) then + call qr_pdlarfl_1dcomm_& + &PRECISION & + (v(1,2),1,baseidx,a,lda,t(2,2), & + work,lwork,m,n,idx,mb,rev,mpicomm) + return + end if + + call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, & + local_size1,v1_local_offset,local_offset1) + call local_size_offset_1d(m,mb,baseidx,idx-1,rev,mpirank,mpiprocs, & + local_size2,v2_local_offset,local_offset2) + + v1_local_offset = v1_local_offset * 1 + v2_local_offset = v2_local_offset * 1 + + v1col = 2 + v2col = 1 + + ! 
keep buffers clean in case that local_size1/local_size2 are zero +#ifdef DOUBLE_PRECISION_REAL + work(1:sendsize) = 0.0_rk8 + + call dgemv("Trans",local_size1,n,1.0_rk8,a(local_offset1,1),lda,v(v1_local_offset,v1col),1,0.0_rk8,work(dgemv1_offset),1) + call dgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda,v(v2_local_offset,v2col),1,0.0_rk8, & + work(dgemv2_offset),1) +#else + work(1:sendsize) = 0.0_rk4 + + call sgemv("Trans",local_size1,n,1.0_rk4,a(local_offset1,1),lda,v(v1_local_offset,v1col),1,0.0_rk4,work(dgemv1_offset),1) + call sgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda,v(v2_local_offset,v2col),1,0.0_rk4, & + work(dgemv2_offset),1) +#endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work, work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#else + call mpi_allreduce(work, work(sendsize+1), int(sendsize,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND), mpierr) +#endif +#else + work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) +#endif + ! update second Vector +#ifdef DOUBLE_PRECISION_REAL + call daxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1) +#else + call saxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1) +#endif + + call local_size_offset_1d(m,mb,baseidx,idx-2,rev,mpirank,mpiprocs, & + local_size_dger,v_local_offset_dger,local_offset_dger) + + ! get ranks of processes with topelements + mpirank_top1 = MOD((idx-1)/mb,mpiprocs) + mpirank_top2 = MOD((idx-2)/mb,mpiprocs) + + if (mpirank_top1 .eq. mpirank) local_offset1 = local_size1 + if (mpirank_top2 .eq. mpirank) then + local_offset2 = local_size2 + v2_local_offset = local_size2 + end if + + ! use hvdot as temporary variable + hvdot = t(v1col,v1col) + do icol=1,n + ! make use of "1" entries in householder vectors + if (mpirank_top1 .eq. 
mpirank) then + a(local_offset1,icol) = a(local_offset1,icol) & + - work(sendsize+dgemv1_offset+icol-1)*hvdot + end if + + if (mpirank_top2 .eq. mpirank) then + a(local_offset2,icol) = a(local_offset2,icol) & + - v(v2_local_offset,v1col)*work(sendsize+dgemv1_offset+icol-1)*hvdot & + - work(sendsize+dgemv2_offset+icol-1) + end if + + do irow=1,local_size_dger + a(local_offset_dger+irow-1,icol) = a(local_offset_dger+irow-1,icol) & + - work(sendsize+dgemv1_offset+icol-1)*v(v_local_offset_dger+irow-1,v1col)*hvdot & + - work(sendsize+dgemv2_offset+icol-1)*v(v_local_offset_dger+irow-1,v2col) + end do + end do + +end subroutine + +! generalized pdlarfl2 version +! TODO: include T merge here (seperate by "old" and "new" index) +subroutine qr_tmerge_pdlarfb_1dcomm_& +&PRECISION & +(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) + use precision + use qr_utils_mod + + implicit none + + ! input variables (local) + integer(kind=ik) :: ldv,ldt,lda,lwork + real(kind=C_DATATYPE_KIND) :: v(ldv,*),t(ldt,*),work(*),a(lda,*) + + ! input variables (global) + integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,updatemode,mpicomm + + ! output variables (global) + + ! derived input variables from QR_PQRPARAM + + ! local scalars + integer(kind=ik) :: localsize,offset,baseoffset + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr + + integer(kind=ik) :: sendoffset,recvoffset,sendsize + integer(kind=ik) :: updateoffset,updatelda,updatesize + integer(kind=ik) :: mergeoffset,mergelda,mergesize + integer(kind=ik) :: tgenoffset,tgenlda,tgensize + + ! quickfix + mergeoffset = 0 + + if (updatemode .eq. ichar('I')) then + updatelda = oldk+k + else + updatelda = k + end if + + updatesize = updatelda*n + + mergelda = k + mergesize = mergelda*oldk + + tgenlda = 0 + tgensize = 0 + + sendsize = updatesize + mergesize + tgensize + + if (lwork .eq. 
-1) then +#ifdef DOUBLE_PRECISION_REAL + work(1) = real(2*sendsize,kind=rk8) +#else + work(1) = real(2*sendsize,kind=rk4) +#endif + return + end if + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + ! use baseidx as idx here, otherwise the upper triangle part will be lost + ! during the calculation, especially in the reversed case + call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & + localsize,baseoffset,offset) + + sendoffset = 1 + + if (oldk .gt. 0) then + updateoffset = 0 + mergeoffset = updateoffset + updatesize + tgenoffset = mergeoffset + mergesize + + sendsize = updatesize + mergesize + tgensize + + !print *,'sendsize',sendsize,updatesize,mergesize,tgensize + !print *,'merging nr of rotations', oldk+k +#ifdef DOUBLE_PRECISION_REAL + if (localsize .gt. 0) then + ! calculate matrix matrix product of householder vectors and target matrix + if (updatemode .eq. ichar('I')) then + ! Z' = (Y1,Y2)' * A + call dgemm("Trans","Notrans",k+oldk,n,localsize,1.0_rk8,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk8, & + work(sendoffset+updateoffset),updatelda) + else + ! Z' = Y1' * A + call dgemm("Trans","Notrans",k,n,localsize,1.0_rk8,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk8, & + work(sendoffset+updateoffset),updatelda) + end if + + ! calculate parts needed for T merge + call dgemm("Trans","Notrans",k,oldk,localsize,1.0_rk8,v(baseoffset,1),ldv,v(baseoffset,k+1),ldv,0.0_rk8, & + work(sendoffset+mergeoffset),mergelda) + + else + ! cleanup buffer + work(sendoffset:sendoffset+sendsize-1) = 0.0_rk8 + end if +#else /* DOUBLE_PRECISION_REAL */ + if (localsize .gt. 0) then + ! calculate matrix matrix product of householder vectors and target matrix + if (updatemode .eq. ichar('I')) then + ! 
Z' = (Y1,Y2)' * A + call sgemm("Trans","Notrans",k+oldk,n,localsize,1.0_rk4,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk4, & + work(sendoffset+updateoffset),updatelda) + else + ! Z' = Y1' * A + call sgemm("Trans","Notrans",k,n,localsize,1.0_rk4,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk4, & + work(sendoffset+updateoffset),updatelda) + end if + + ! calculate parts needed for T merge + call sgemm("Trans","Notrans",k,oldk,localsize,1.0_rk4,v(baseoffset,1),ldv,v(baseoffset,k+1),ldv,0.0_rk4, & + work(sendoffset+mergeoffset),mergelda) + + else + ! cleanup buffer + work(sendoffset:sendoffset+sendsize-1) = 0.0_rk4 + end if +#endif /* DOUBLE_PRECISION_REAL */ + + else + ! do not calculate parts for T merge as there is nothing to merge + + mergeoffset = 0 + updateoffset = 0 + + tgenoffset = updateoffset + updatesize + + sendsize = updatesize + tgensize +#ifdef DOUBLE_PRECISION_REAL + if (localsize .gt. 0) then + ! calculate matrix matrix product of householder vectors and target matrix + ! Z' = (Y1)' * A + call dgemm("Trans","Notrans",k,n,localsize,1.0_rk8,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk8, & + work(sendoffset+updateoffset),updatelda) + + else + ! cleanup buffer + work(sendoffset:sendoffset+sendsize-1) = 0.0_rk8 + end if +#else + if (localsize .gt. 0) then + ! calculate matrix matrix product of householder vectors and target matrix + ! Z' = (Y1)' * A + call sgemm("Trans","Notrans",k,n,localsize,1.0_rk4,v(baseoffset,1),ldv,a(offset,1),lda,0.0_rk4, & + work(sendoffset+updateoffset),updatelda) + + else + ! cleanup buffer + work(sendoffset:sendoffset+sendsize-1) = 0.0_rk4 + end if +#endif + end if + + recvoffset = sendoffset + sendsize + + if (sendsize .le. 0) return ! nothing to do + + ! 
exchange data +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(work(sendoffset),work(recvoffset), int(sendsize,kind=MPI_KIND), mpi_real8, mpi_sum, & + int(mpicomm,kind=MPI_KIND) ,mpierr) +#else + call mpi_allreduce(work(sendoffset),work(recvoffset), int(sendsize,kind=MPI_KIND), mpi_real4, mpi_sum, & + int(mpicomm,kind=MPI_KIND) ,mpierr) +#endif + +#else + work(recvoffset:recvoffset+sendsize-1) = work(sendoffset:sendoffset+sendsize-1) +#endif + updateoffset = recvoffset+updateoffset + mergeoffset = recvoffset+mergeoffset + tgenoffset = recvoffset+tgenoffset + + if (oldk .gt. 0) then + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (oldk,k,t,ldt,work(mergeoffset),mergelda) + + if (localsize .gt. 0) then + if (updatemode .eq. ichar('I')) then + + ! update matrix (pdlarfb) with complete T + call qr_pdlarfb_kernel_local_& + &PRECISION & + (localsize,n,k+oldk,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & + work(updateoffset),updatelda) + else + ! update matrix (pdlarfb) with small T (same as update with no old T TODO) + call qr_pdlarfb_kernel_local_& + &PRECISION & + (localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & + work(updateoffset),updatelda) + end if + end if + else + if (localsize .gt. 0) then + ! update matrix (pdlarfb) with small T + call qr_pdlarfb_kernel_local_& + &PRECISION & + (localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & + work(updateoffset),updatelda) + end if + end if + +end subroutine diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_qrkernels.F90 elpa-2019.11.001/src/elpa2/qr/elpa_qrkernels.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_qrkernels.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_qrkernels.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,67 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! 
+ +#include "config-f90.h" + +module elpa_qrkernels + implicit none + public + + contains + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "elpa_qrkernels_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "elpa_qrkernels_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif /* WANT_SINGLE_PRECISION_REAL */ + +end module diff -Nru elpa-2016.05.001/src/elpa2/qr/elpa_qrkernels_template.F90 elpa-2019.11.001/src/elpa2/qr/elpa_qrkernels_template.F90 --- elpa-2016.05.001/src/elpa2/qr/elpa_qrkernels_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/elpa_qrkernels_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,837 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! + +#endif + +! calculates A = A - Y*T'*Z (rev=0) +! calculates A = A - Y*T*Z (rev=1) +! T upper triangle matrix +! assuming zero entries in matrix in upper kxk block + +subroutine qr_pdlarfb_kernel_local_& +&PRECISION & +(m,n,k,a,lda,v,ldv,t,ldt,z,ldz) + use precision + implicit none + + ! input variables (local) + integer(kind=ik) :: lda,ldv,ldt,ldz + real(kind=REAL_DATATYPE) :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*) + + ! input variables (global) + integer(kind=ik) :: m,n,k + + ! local variables + real(kind=REAL_DATATYPE) :: t11 + real(kind=REAL_DATATYPE) :: t12,t22,sum1,sum2 + real(kind=REAL_DATATYPE) :: t13,t23,t33,sum3 + real(kind=REAL_DATATYPE) :: sum4,t44 + real(kind=REAL_DATATYPE) :: y1,y2,y3,y4 + real(kind=REAL_DATATYPE) :: a1 + integer(kind=ik) :: icol,irow,v1col,v2col,v3col + + ! reference implementation + if (k .eq. 1) then + t11 = t(1,1) + do icol=1,n + sum1 = z(1,icol) + a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1) + enddo + return + else if (k .eq. 2) then + v1col = 2 + v2col = 1 + t22 = t(1,1) + t12 = t(1,2) + t11 = t(2,2) + + do icol=1,n + sum1 = t11 * z(v1col,icol) + sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol) + + do irow=1,m + a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 + end do + end do + else if (k .eq. 
3) then
+ v1col = 3
+ v2col = 2
+ v3col = 1
+
+ t33 = t(1,1)
+
+ t23 = t(1,2)
+ t22 = t(2,2)
+
+ t13 = t(1,3)
+ t12 = t(2,3)
+ t11 = t(3,3)
+
+ do icol=1,n
+ ! misusing variables for fetch of z parts
+ y1 = z(v1col,icol)
+ y2 = z(v2col,icol)
+ y3 = z(v3col,icol)
+
+ sum1 = t11 * y1!+ 0 * y2!+ 0 * y3
+ sum2 = t12 * y1 + t22 * y2!+ 0 * y3
+ sum3 = t13 * y1 + t23 * y2 + t33 * y3
+
+ do irow=1,m
+ a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - v(irow,v3col) * sum3
+ end do
+ end do
+ else if (k .eq. 4) then
+ do icol=1,n
+ ! misusing variables for fetch of z parts
+ y1 = z(1,icol)
+ y2 = z(2,icol)
+ y3 = z(3,icol)
+ y4 = z(4,icol)
+
+ ! dtrmv like - starting from main diagonal and working
+ ! upwards
+ t11 = t(1,1)
+ t22 = t(2,2)
+ t33 = t(3,3)
+ t44 = t(4,4)
+
+ sum1 = t11 * y1
+ sum2 = t22 * y2
+ sum3 = t33 * y3
+ sum4 = t44 * y4
+
+ t11 = t(1,2)
+ t22 = t(2,3)
+ t33 = t(3,4)
+
+ sum1 = sum1 + t11 * y2
+ sum2 = sum2 + t22 * y3
+ sum3 = sum3 + t33 * y4
+
+ t11 = t(1,3)
+ t22 = t(2,4)
+
+ sum1 = sum1 + t11 * y3
+ sum2 = sum2 + t22 * y4
+
+ t11 = t(1,4)
+ sum1 = sum1 + t11 * y4
+
+ ! one column of V is calculated
+ ! time to calculate A - Y * V
+ do irow=1,m ! TODO: loop unrolling
+ y1 = v(irow,1)
+ y2 = v(irow,2)
+ y3 = v(irow,3)
+ y4 = v(irow,4)
+
+ a1 = a(irow,icol)
+
+ a1 = a1 - y1*sum1
+ a1 = a1 - y2*sum2
+ a1 = a1 - y3*sum3
+ a1 = a1 - y4*sum4
+
+ a(irow,icol) = a1
+ end do
+ end do
+ else
+ ! reference implementation
+#ifdef DOUBLE_PRECISION_REAL
+ ! V' = T * Z'
+ call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk8,t,ldt,z,ldz)
+ ! A = A - Y * V'
+ call dgemm("Notrans","Notrans",m,n,k,-1.0_rk8,v,ldv,z,ldz,1.0_rk8,a,lda)
+#else
+ ! V' = T * Z'
+ call strmm("Left","Upper","Notrans","Nonunit",k,n,1.0_rk4,t,ldt,z,ldz)
+ ! A = A - Y * V'
+ call sgemm("Notrans","Notrans",m,n,k,-1.0_rk4,v,ldv,z,ldz,1.0_rk4,a,lda)
+#endif
+ end if
+
+end subroutine
+
+subroutine qr_pdlarft_merge_kernel_local_&
+&PRECISION &
+(oldk,k,t,ldt,yty,ldy)
+ use precision
+ implicit none
+
+ ! input variables (local)
+ integer(kind=ik) :: ldt,ldy
+ real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*)
+
+ ! input variables (global)
+ integer(kind=ik) :: k,oldk
+
+ ! output variables (global)
+
+ ! local scalars
+ integer(kind=ik) :: icol,leftk,rightk
+
+ ! local scalars for optimized versions
+ integer(kind=ik) :: irow
+ real(kind=REAL_DATATYPE) :: t11
+ real(kind=REAL_DATATYPE) :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8
+ real(kind=REAL_DATATYPE) :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08
+ real(kind=REAL_DATATYPE) :: final01,final02,final03,final04,final05,final06,final07,final08
+
+ if (oldk .eq. 0) return ! nothing to be done
+
+ leftk = k
+ rightk = oldk
+
+ ! optimized implementations:
+ if (leftk .eq. 1) then
+ do icol=1,rightk
+ ! multiply inner products with right t matrix
+ ! (dtrmv like)
+ yty1 = yty(1,1)
+ t11 = t(leftk+1,leftk+icol)
+
+ reg01 = yty1 * t11
+
+ do irow=2,icol
+ yty1 = yty(1,irow)
+ t11 = t(leftk+irow,leftk+icol)
+
+ reg01 = reg01 + yty1 * t11
+ end do
+
+ ! multiply intermediate results with left t matrix and store in final t
+ ! matrix
+ t11 = -t(1,1)
+ final01 = t11 * reg01
+ t(1,leftk+icol) = final01
+ end do
+
+ !print *,'efficient tmerge - leftk=1'
+ else if (leftk .eq. 2) then
+ do icol=1,rightk
+ ! multiply inner products with right t matrix
+ ! (dtrmv like)
+ yty1 = yty(1,1)
+ yty2 = yty(2,1)
+
+ t11 = t(leftk+1,leftk+icol)
+
+ reg01 = yty1 * t11
+ reg02 = yty2 * t11
+
+ do irow=2,icol
+ yty1 = yty(1,irow)
+ yty2 = yty(2,irow)
+ t11 = t(leftk+irow,leftk+icol)
+
+ reg01 = reg01 + yty1 * t11
+ reg02 = reg02 + yty2 * t11
+ end do
+
+ ! multiply intermediate results with left t matrix and store in final t
+ ! 
matrix + yty1 = -t(1,1) + yty2 = -t(1,2) + yty3 = -t(2,2) + + final01 = reg02 * yty2 + final02 = reg02 * yty3 + + final01 = final01 + reg01 * yty1 + + t(1,leftk+icol) = final01 + t(2,leftk+icol) = final02 + end do + + !print *,'efficient tmerge - leftk=2' + else if (leftk .eq. 4) then + do icol=1,rightk + ! multiply inner products with right t matrix + ! (dtrmv like) + yty1 = yty(1,1) + yty2 = yty(2,1) + yty3 = yty(3,1) + yty4 = yty(4,1) + + t11 = t(leftk+1,leftk+icol) + + reg01 = yty1 * t11 + reg02 = yty2 * t11 + reg03 = yty3 * t11 + reg04 = yty4 * t11 + + do irow=2,icol + yty1 = yty(1,irow) + yty2 = yty(2,irow) + yty3 = yty(3,irow) + yty4 = yty(4,irow) + + t11 = t(leftk+irow,leftk+icol) + + reg01 = reg01 + yty1 * t11 + reg02 = reg02 + yty2 * t11 + reg03 = reg03 + yty3 * t11 + reg04 = reg04 + yty4 * t11 + end do + + ! multiply intermediate results with left t matrix and store in final t + ! matrix (start from diagonal and move upwards) + yty1 = -t(1,1) + yty2 = -t(2,2) + yty3 = -t(3,3) + yty4 = -t(4,4) + + ! main diagonal + final01 = reg01 * yty1 + final02 = reg02 * yty2 + final03 = reg03 * yty3 + final04 = reg04 * yty4 + + ! above main diagonal + yty1 = -t(1,2) + yty2 = -t(2,3) + yty3 = -t(3,4) + + final01 = final01 + reg02 * yty1 + final02 = final02 + reg03 * yty2 + final03 = final03 + reg04 * yty3 + + ! above first side diagonal + yty1 = -t(1,3) + yty2 = -t(2,4) + + final01 = final01 + reg03 * yty1 + final02 = final02 + reg04 * yty2 + + ! above second side diagonal + yty1 = -t(1,4) + + final01 = final01 + reg04 * yty1 + + ! write back to final matrix + t(1,leftk+icol) = final01 + t(2,leftk+icol) = final02 + t(3,leftk+icol) = final03 + t(4,leftk+icol) = final04 + end do + + !print *,'efficient tmerge - leftk=4' + else if (leftk .eq. 8) then + do icol=1,rightk + ! multiply inner products with right t matrix + ! 
(dtrmv like) + yty1 = yty(1,1) + yty2 = yty(2,1) + yty3 = yty(3,1) + yty4 = yty(4,1) + yty5 = yty(5,1) + yty6 = yty(6,1) + yty7 = yty(7,1) + yty8 = yty(8,1) + + t11 = t(leftk+1,leftk+icol) + + reg01 = yty1 * t11 + reg02 = yty2 * t11 + reg03 = yty3 * t11 + reg04 = yty4 * t11 + reg05 = yty5 * t11 + reg06 = yty6 * t11 + reg07 = yty7 * t11 + reg08 = yty8 * t11 + + do irow=2,icol + yty1 = yty(1,irow) + yty2 = yty(2,irow) + yty3 = yty(3,irow) + yty4 = yty(4,irow) + yty5 = yty(5,irow) + yty6 = yty(6,irow) + yty7 = yty(7,irow) + yty8 = yty(8,irow) + + t11 = t(leftk+irow,leftk+icol) + + reg01 = reg01 + yty1 * t11 + reg02 = reg02 + yty2 * t11 + reg03 = reg03 + yty3 * t11 + reg04 = reg04 + yty4 * t11 + reg05 = reg05 + yty5 * t11 + reg06 = reg06 + yty6 * t11 + reg07 = reg07 + yty7 * t11 + reg08 = reg08 + yty8 * t11 + end do + + ! multiply intermediate results with left t matrix and store in final t + ! matrix (start from diagonal and move upwards) + yty1 = -t(1,1) + yty2 = -t(2,2) + yty3 = -t(3,3) + yty4 = -t(4,4) + yty5 = -t(5,5) + yty6 = -t(6,6) + yty7 = -t(7,7) + yty8 = -t(8,8) + + ! main diagonal + final01 = reg01 * yty1 + final02 = reg02 * yty2 + final03 = reg03 * yty3 + final04 = reg04 * yty4 + final05 = reg05 * yty5 + final06 = reg06 * yty6 + final07 = reg07 * yty7 + final08 = reg08 * yty8 + + ! above main diagonal + yty1 = -t(1,2) + yty2 = -t(2,3) + yty3 = -t(3,4) + yty4 = -t(4,5) + yty5 = -t(5,6) + yty6 = -t(6,7) + yty7 = -t(7,8) + + final01 = final01 + reg02 * yty1 + final02 = final02 + reg03 * yty2 + final03 = final03 + reg04 * yty3 + final04 = final04 + reg05 * yty4 + final05 = final05 + reg06 * yty5 + final06 = final06 + reg07 * yty6 + final07 = final07 + reg08 * yty7 + + ! 
above first side diagonal + yty1 = -t(1,3) + yty2 = -t(2,4) + yty3 = -t(3,5) + yty4 = -t(4,6) + yty5 = -t(5,7) + yty6 = -t(6,8) + + final01 = final01 + reg03 * yty1 + final02 = final02 + reg04 * yty2 + final03 = final03 + reg05 * yty3 + final04 = final04 + reg06 * yty4 + final05 = final05 + reg07 * yty5 + final06 = final06 + reg08 * yty6 + + !above second side diagonal + + yty1 = -t(1,4) + yty2 = -t(2,5) + yty3 = -t(3,6) + yty4 = -t(4,7) + yty5 = -t(5,8) + + final01 = final01 + reg04 * yty1 + final02 = final02 + reg05 * yty2 + final03 = final03 + reg06 * yty3 + final04 = final04 + reg07 * yty4 + final05 = final05 + reg08 * yty5 + + ! i think you got the idea by now + + yty1 = -t(1,5) + yty2 = -t(2,6) + yty3 = -t(3,7) + yty4 = -t(4,8) + + final01 = final01 + reg05 * yty1 + final02 = final02 + reg06 * yty2 + final03 = final03 + reg07 * yty3 + final04 = final04 + reg08 * yty4 + + ! ..... + + yty1 = -t(1,6) + yty2 = -t(2,7) + yty3 = -t(3,8) + + final01 = final01 + reg06 * yty1 + final02 = final02 + reg07 * yty2 + final03 = final03 + reg08 * yty3 + + ! ..... + + yty1 = -t(1,7) + yty2 = -t(2,8) + + final01 = final01 + reg07 * yty1 + final02 = final02 + reg08 * yty2 + + ! ..... + + yty1 = -t(1,8) + + final01 = final01 + reg08 * yty1 + + ! write back to final matrix + t(1,leftk+icol) = final01 + t(2,leftk+icol) = final02 + t(3,leftk+icol) = final03 + t(4,leftk+icol) = final04 + t(5,leftk+icol) = final05 + t(6,leftk+icol) = final06 + t(7,leftk+icol) = final07 + t(8,leftk+icol) = final08 + end do + + !print *,'efficient tmerge - leftk=8' + else + ! reference implementation + do icol=1,rightk + t(1:leftk,leftk+icol) = yty(1:leftk,icol) + end do +#ifdef DOUBLE_PRECISION_REAL + ! -T1 * Y1'*Y2 + call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk8,t(1,1),ldt,t(1,leftk+1),ldt) + ! (-T1 * Y1'*Y2) * T2 + call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk8,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt) +#else + ! 
-T1 * Y1'*Y2 + call strmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0_rk4,t(1,1),ldt,t(1,leftk+1),ldt) + ! (-T1 * Y1'*Y2) * T2 + call strmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0_rk4,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt) + +#endif + end if + +end subroutine + + +! yty structure +! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... +! 0 Y2'*Y3 Y2'*Y4 ... +! 0 0 Y3'*Y4 ... +! 0 0 0 ... + +subroutine qr_tmerge_set_kernel_& +&PRECISION & +(k,blocksize,t,ldt,yty,ldy) + use precision + implicit none + + ! input variables (local) + integer(kind=ik) :: ldt,ldy + real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*) + + ! input variables (global) + integer(kind=ik) :: k,blocksize + + ! output variables (global) + + ! local scalars + integer(kind=ik) :: nr_blocks,current_block + integer(kind=ik) :: remainder,oldk + integer(kind=ik) :: yty_column,toffset + + if (k .le. blocksize) return ! nothing to merge + + nr_blocks = k / blocksize + remainder = k - nr_blocks*blocksize + + ! work in "negative" direction: + ! start with latest T matrix part and add older ones + toffset = 1 + yty_column = 1 + + if (remainder .gt. 0) then + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy) + current_block = 1 + oldk = remainder+blocksize + yty_column = yty_column + blocksize + else + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy) + current_block = 2 + oldk = 2*blocksize + yty_column = yty_column + blocksize + end if + + do while (current_block .lt. nr_blocks) + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) + current_block = current_block + 1 + oldk = oldk + blocksize + yty_column = yty_column + blocksize + end do + +end subroutine +! yty structure +! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... +! 0 Y2'*Y3 Y2'*Y4 ... +! 0 0 Y3'*Y4 ... +! 0 0 0 ... 
+ +subroutine qr_tmerge_tree_kernel_& +&PRECISION & +(k,blocksize,treeorder,t,ldt,yty,ldy) + use precision + implicit none + + ! input variables (local) + integer(kind=ik) :: ldt,ldy + real(kind=REAL_DATATYPE) :: t(ldt,*),yty(ldy,*) + + ! input variables (global) + integer(kind=ik) :: k,blocksize,treeorder + + ! output variables (global) + + ! local scalars + integer temp_blocksize,nr_sets,current_set,setsize,nr_blocks + integer remainder,max_treeorder,remaining_size + integer toffset,yty_column + integer toffset_start,yty_column_start + integer yty_end,total_remainder,yty_remainder + + if (treeorder .eq. 0) return ! no merging + + if (treeorder .eq. 1) then + call qr_tmerge_set_kernel_& + &PRECISION & + (k,blocksize,t,ldt,yty,ldy) + return + end if + + nr_blocks = k / blocksize + max_treeorder = min(nr_blocks,treeorder) + + if (max_treeorder .eq. 1) then + call qr_tmerge_set_kernel_& + &PRECISION & + (k,blocksize,t,ldt,yty,ldy) + return + end if + + ! work in "negative" direction: from latest set to oldest set + ! implementation differs from rev=0 version due to issues with + ! calculating the remainder parts + ! compared to the rev=0 version we split remainder parts directly from + ! parts which can be easily merged in a recursive way + + yty_end = (k / blocksize) * blocksize + if (yty_end .eq. k) then + yty_end = yty_end - blocksize + end if + + !print *,'tree',yty_end,k,blocksize + + yty_column_start = 1 + toffset_start = 1 + + ! is there a remainder block? + nr_blocks = k / blocksize + remainder = k - nr_blocks * blocksize + if (remainder .eq. 0) then + !print *,'no initial remainder' + + ! set offsets to the very beginning as there is no remainder part + yty_column_start = 1 + toffset_start = 1 + total_remainder = 0 + remaining_size = k + yty_remainder = 0 + else + !print *,'starting with initial remainder' + ! 
select submatrix and make remainder block public + yty_column_start = 1 + blocksize + toffset_start = 1 + remainder + total_remainder = remainder + remaining_size = k - remainder + yty_remainder = 1 + end if + + ! from now on it is a clean set of blocks with sizes of multiple of + ! blocksize + + temp_blocksize = blocksize + + !------------------------------- + do while (remaining_size .gt. 0) + nr_blocks = remaining_size / temp_blocksize + max_treeorder = min(nr_blocks,treeorder) + + if (max_treeorder .eq. 1) then + remainder = 0 + nr_sets = 0 + setsize = 0 + + if (yty_remainder .gt. 0) then + yty_column = yty_remainder + !print *,'final merging with remainder',temp_blocksize,k,remaining_size,yty_column + call qr_tmerge_set_kernel_& + &PRECISION & + (k,temp_blocksize,t,ldt,yty(1,yty_column),ldy) + else + !print *,'no remainder - no merging needed',temp_blocksize,k,remaining_size + endif + + remaining_size = 0 + + return ! done + else + nr_sets = nr_blocks / max_treeorder + setsize = max_treeorder*temp_blocksize + remainder = remaining_size - nr_sets*setsize + end if + + if (remainder .gt. 0) then + if (remainder .gt. temp_blocksize) then + toffset = toffset_start + yty_column = yty_column_start + + !print *,'set merging', toffset, yty_column,remainder + call qr_tmerge_set_kernel_& + &PRECISION & + (remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) + if (total_remainder .gt. 0) then + ! merge with existing global remainder part + !print *,'single+set merging',yty_remainder,total_remainder,remainder + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) + yty_remainder = yty_remainder + remainder + toffset_start = toffset_start + remainder + + !print *,'single+set merging (new offsets)',yty_remainder,yty_column_start,toffset_start + + yty_column_start = yty_column_start + remainder + else + ! 
create new remainder part + !print *,'new remainder+set',yty_remainder + yty_remainder = yty_column_start + remainder - temp_blocksize + yty_column_start = yty_column_start + remainder + toffset_start = toffset_start + remainder + !print *,'new remainder+set (new offsets)',yty_remainder,yty_column_start,toffset_start + end if + + else + if (total_remainder .gt. 0) then + ! merge with existing global remainder part + !print *,'single merging',yty_remainder,total_remainder,remainder + call qr_pdlarft_merge_kernel_local_& + &PRECISION & + (remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) + yty_remainder = yty_remainder + remainder + toffset_start = toffset_start + remainder + + !print *,'single merging (new offsets)',yty_remainder,yty_column_start,toffset_start + + yty_column_start = yty_column_start + remainder + else + ! create new remainder part + !print *,'new remainder',yty_remainder + yty_remainder = yty_column_start + yty_column_start = yty_column_start + temp_blocksize + toffset_start = toffset_start + remainder + !print *,'new remainder (new offsets)',yty_remainder,yty_column_start,toffset_start + end if + end if + + total_remainder = total_remainder + remainder + remaining_size = remaining_size - remainder + end if + + current_set = 0 + do while (current_set .lt. nr_sets) + toffset = toffset_start + current_set * setsize + yty_column = yty_column_start + current_set * setsize + + !print *,'recursive merging', toffset, yty_column,setsize + call qr_tmerge_set_kernel_& + &PRECISION & + (setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) + + current_set = current_set + 1 + end do + + !print *,'increasing blocksize', temp_blocksize, setsize + yty_column_start = yty_column_start + (setsize - temp_blocksize) + temp_blocksize = setsize + end do +end subroutine + + +! yty should not contain the inner products vi'*vi + +subroutine qr_dlarft_kernel_& +&PRECISION & +(n,tau,yty,ldy,t,ldt) + use precision + implicit none + + ! 
input variables + integer(kind=ik) :: n,ldy,ldt + real(kind=REAL_DATATYPE) :: tau(*),yty(ldy,*) + + ! output variables + real(kind=REAL_DATATYPE) :: t(ldt,*) + + ! local variables + integer(kind=ik) :: icol + + ! DEBUG: clear buffer first + !t(1:n,1:n) = 0.0d0 + + ! T1 = tau1 + ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | + ! | 0 Tk-1 | + t(n,n) = tau(n) + do icol=n-1,1,-1 + t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1) +#ifdef DOUBLE_PRECISION_REAL + call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) +#else + call strmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) +#endif + t(icol,icol) = tau(icol) + end do +end subroutine + +! vim: syntax=fortran diff -Nru elpa-2016.05.001/src/elpa2/qr/qr_utils.F90 elpa-2019.11.001/src/elpa2/qr/qr_utils.F90 --- elpa-2016.05.001/src/elpa2/qr/qr_utils.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/qr_utils.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,87 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! 
ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +module qr_utils_mod + use elpa_mpi + use elpa1_compute + use elpa_utilities + implicit none + + PRIVATE + + public :: local_size_offset_1d + + public :: reverse_vector_local_double + public :: reverse_matrix_local_double + public :: reverse_matrix_1dcomm_double + public :: reverse_matrix_2dcomm_ref_double + +#if WANT_SINGLE_PRECISION_REAL + public :: reverse_vector_local_single + public :: reverse_matrix_local_single + public :: reverse_matrix_1dcomm_single + public :: reverse_matrix_2dcomm_ref_single +#endif + + contains + ! real double precision +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "qr_utils_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL + ! 
real single precision +#define REALCASE 1 +#define ALREADY_DEFINED +#define SINGLE_PRECISION 1 +#include "../../general/precision_macros.h" +#include "qr_utils_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +end module diff -Nru elpa-2016.05.001/src/elpa2/qr/qr_utils_template.F90 elpa-2019.11.001/src/elpa2/qr/qr_utils_template.F90 --- elpa-2016.05.001/src/elpa2/qr/qr_utils_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/qr/qr_utils_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,437 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! 
ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" +#ifndef ALREADY_DEFINED + subroutine local_size_offset_1d(n,nb,baseidx,idx,rev,rank,nprocs, & + lsize,baseoffset,offset) + + use precision + use ELPA1_compute + + implicit none + + ! input + integer(kind=ik) :: n,nb,baseidx,idx,rev,rank,nprocs + + ! output + integer(kind=ik) :: lsize,baseoffset,offset + + ! local scalars + integer(kind=ik) :: rank_idx + + rank_idx = MOD((idx-1)/nb,nprocs) + + ! calculate local size and offsets + if (rev .eq. 1) then + if (idx > 0) then + lsize = local_index(idx,rank,nprocs,nb,-1) + else + lsize = 0 + end if + + baseoffset = 1 + offset = 1 + else + offset = local_index(idx,rank,nprocs,nb,1) + baseoffset = local_index(baseidx,rank,nprocs,nb,1) + + lsize = local_index(n,rank,nprocs,nb,-1) + !print *,'baseidx,idx',baseidx,idx,lsize,n + + lsize = lsize - offset + 1 + + baseoffset = offset - baseoffset + 1 + end if + +end subroutine local_size_offset_1d +#endif + +subroutine reverse_vector_local_& + &PRECISION & + (n,x,incx,work,lwork) + use precision + implicit none +#include "../../general/precision_kinds.F90" + + ! input + integer(kind=ik) :: incx, n, lwork + real(kind=C_DATATYPE_KIND) :: x(*), work(*) + + ! local scalars + real(kind=C_DATATYPE_KIND) :: temp + integer(kind=ik) :: srcoffset, destoffset, ientry + + if (lwork .eq. 
-1) then + work(1) = 0.0_rk + return + end if + + do ientry=1,n/2 + srcoffset=1+(ientry-1)*incx + destoffset=1+(n-ientry)*incx + + temp = x(srcoffset) + x(srcoffset) = x(destoffset) + x(destoffset) = temp + end do + +end subroutine + +subroutine reverse_matrix_local_& + &PRECISION & + (trans, m, n, a, lda, work, lwork) + use precision + implicit none + + ! input + integer(kind=ik) :: lda,m,n,lwork,trans + real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*) + + ! local scalars + real(kind=C_DATATYPE_KIND) :: dworksize(1) + integer(kind=ik) :: incx + integer(kind=ik) :: dimsize + integer(kind=ik) :: i + + if (trans .eq. 1) then + incx = lda + dimsize = n + else + incx = 1 + dimsize = m + end if + + if (lwork .eq. -1) then + call reverse_vector_local_& + &PRECISION & + (dimsize, a, incx, dworksize, -1) + work(1) = dworksize(1) + return + end if + + if (trans .eq. 1) then + do i=1,m + call reverse_vector_local_& + &PRECISION & + (dimsize, a(i,1), incx, work, lwork) + end do + else + do i=1,n + call reverse_vector_local_& + &PRECISION & + (dimsize, a(1,i), incx, work, lwork) + end do + end if + +end subroutine + +subroutine reverse_matrix_2dcomm_ref_& + &PRECISION & + (m, n, mb, nb, a, lda, work, lwork, mpicomm_cols, mpicomm_rows) + use precision + implicit none + + ! input + integer(kind=ik) :: m, n, lda, lwork, mpicomm_cols, mpicomm_rows, mb, nb + real(kind=C_DATATYPE_KIND) :: a(lda,*),work(*) + + ! 
local scalars + real(kind=C_DATATYPE_KIND) :: reverse_column_size(1) + real(kind=C_DATATYPE_KIND) :: reverse_row_size(1) + + integer(kind=ik) :: mpirank_cols, mpirank_rows + integer(kind=ik) :: mpiprocs_cols, mpiprocs_rows + integer(kind=MPI_KIND) :: mpirank_colsMPI, mpirank_rowsMPI + integer(kind=MPI_KIND) :: mpiprocs_colsMPI, mpiprocs_rowsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: lrows, lcols, offset, baseoffset + + call MPI_Comm_rank(int(mpicomm_cols,kind=MPI_KIND) ,mpirank_colsMPI, mpierr) + call MPI_Comm_rank(int(mpicomm_rows,kind=MPI_KIND) ,mpirank_rowsMPI, mpierr) + call MPI_Comm_size(int(mpicomm_cols,kind=MPI_KIND) ,mpiprocs_colsMPI, mpierr) + call MPI_Comm_size(int(mpicomm_rows,kind=MPI_KIND) ,mpiprocs_rowsMPI, mpierr) + + mpirank_cols = int(mpirank_colsMPI,kind=c_int) + mpirank_rows = int(mpirank_rowsMPI,kind=c_int) + mpiprocs_cols = int(mpiprocs_colsMPI,kind=c_int) + mpiprocs_rows = int(mpiprocs_rowsMPI,kind=c_int) + + call local_size_offset_1d(m,mb,1,1,0,mpirank_cols,mpiprocs_cols, & + lrows,baseoffset,offset) + + call local_size_offset_1d(n,nb,1,1,0,mpirank_rows,mpiprocs_rows, & + lcols,baseoffset,offset) + + if (lwork .eq. -1) then + call reverse_matrix_1dcomm_& + &PRECISION & + (0,m,lcols,mb,a,lda,reverse_column_size,-1,mpicomm_cols) + call reverse_matrix_1dcomm_& + &PRECISION & + (1,lrows,n,nb,a,lda,reverse_row_size,-1,mpicomm_rows) + work(1) = max(reverse_column_size(1),reverse_row_size(1)) + return + end if + + call reverse_matrix_1dcomm_& + &PRECISION & + (0,m,lcols,mb,a,lda,work,lwork,mpicomm_cols) + call reverse_matrix_1dcomm_& + &PRECISION & + (1,lrows,n,nb,a,lda,work,lwork,mpicomm_rows) +end subroutine + +! b: if trans = 'N': b is size of block distribution between rows +! b: if trans = 'T': b is size of block distribution between columns +subroutine reverse_matrix_1dcomm_& + &PRECISION & + (trans, m, n, b, a, lda, work, lwork, mpicomm) + use precision + use elpa_mpi + + implicit none + + ! 
input + integer(kind=ik) :: trans + integer(kind=ik) :: m, n, b, lda, lwork, mpicomm + real(kind=C_DATATYPE_KIND) :: a(lda,*), work(*) + + ! local scalars + integer(kind=ik) :: mpirank, mpiprocs + integer(kind=MPI_KIND) :: mpirankMPI, mpiprocsMPI, mpierr +#ifdef WITH_MPI + integer(kind=ik) :: my_mpistatus(MPI_STATUS_SIZE) +#endif + integer(kind=ik) :: nr_blocks,dest_process,src_process,step + integer(kind=ik) :: lsize,baseoffset,offset + integer(kind=ik) :: current_index,destblk,srcblk,icol,next_index + integer(kind=ik) :: sendcount,recvcount + integer(kind=ik) :: sendoffset,recvoffset + integer(kind=ik) :: newmatrix_offset,work_offset + integer(kind=ik) :: lcols,lrows,lroffset,lcoffset,dimsize,fixedsize + real(kind=C_DATATYPE_KIND) :: dworksize(1) + + call MPI_Comm_rank(int(mpicomm,kind=MPI_KIND), mpirankMPI, mpierr) + call MPI_Comm_size(int(mpicomm,kind=MPI_KIND), mpiprocsMPI, mpierr) + + mpirank = int(mpirankMPI,kind=c_int) + mpiprocs = int(mpiprocsMPI,kind=c_int) + + if (trans .eq. 1) then + call local_size_offset_1d(n,b,1,1,0,mpirank,mpiprocs, & + lcols,baseoffset,lcoffset) + lrows = m + else + call local_size_offset_1d(m,b,1,1,0,mpirank,mpiprocs, & + lrows,baseoffset,lroffset) + lcols = n + end if + + if (lwork .eq. -1) then + call reverse_matrix_local_& + &PRECISION & + (trans,lrows,lcols,a,max(lrows,lcols),dworksize,-1) + work(1) = real(3*lrows*lcols,kind=REAL_DATATYPE) + dworksize(1) + return + end if + + sendoffset = 1 + recvoffset = sendoffset + lrows*lcols + newmatrix_offset = recvoffset + lrows*lcols + work_offset = newmatrix_offset + lrows*lcols + + if (trans .eq. 1) then + dimsize = n + fixedsize = m + else + dimsize = m + fixedsize = n + end if + + if (dimsize .le. 1) then + return ! nothing to do + end if + + ! 1. adjust step size to remainder size + nr_blocks = dimsize / b + nr_blocks = nr_blocks * b + step = dimsize - nr_blocks + if (step .eq. 0) step = b + + ! 2. 
iterate over destination blocks starting with process 0 + current_index = 1 + do while (current_index .le. dimsize) + destblk = (current_index-1) / b + dest_process = mod(destblk,mpiprocs) + srcblk = (dimsize-current_index) / b + src_process = mod(srcblk,mpiprocs) + + next_index = current_index+step + + ! block for dest_process is located on mpirank if lsize > 0 + call local_size_offset_1d(dimsize-current_index+1,b,dimsize-next_index+2,dimsize-next_index+2,0, & + src_process,mpiprocs,lsize,baseoffset,offset) + + sendcount = lsize*fixedsize + recvcount = sendcount + + ! TODO: this send/recv stuff seems to blow up on BlueGene/P + ! TODO: is there actually room for the requested matrix part? the target + ! process might not have any parts at all (thus no room) + if ((src_process .eq. mpirank) .and. (dest_process .eq. src_process)) then + ! 5. pack data + if (trans .eq. 1) then + do icol=offset,offset+lsize-1 + work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & + a(1:lrows,icol) + end do + else + do icol=1,lcols + work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & + a(offset:offset+lsize-1,icol) + end do + end if + + ! 7. reverse data + if (trans .eq. 1) then + call reverse_matrix_local_& + &PRECISION & + (1,lrows,lsize,work(sendoffset),lrows,work(work_offset),lwork) + else + call reverse_matrix_local_& + &PRECISION & + (0,lsize,lcols,work(sendoffset),lsize,work(work_offset),lwork) + end if + + ! 8. store in temp matrix + if (trans .eq. 1) then + do icol=1,lsize + work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & + work(sendoffset+(icol-1)*lrows:sendoffset+icol*lrows-1) + end do + + newmatrix_offset = newmatrix_offset + lsize*lrows + else + do icol=1,lcols + work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & + work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) + end do + + newmatrix_offset = newmatrix_offset + lsize + end if + else + + if (dest_process .eq. mpirank) then + ! 6b. 
call MPI_Recv + +#ifdef WITH_MPI + call MPI_Recv(work(recvoffset), int(recvcount,kind=MPI_KIND), MPI_REAL_PRECISION, & + int(src_process,kind=MPI_KIND), int(current_index,kind=MPI_KIND), & + int(mpicomm,kind=MPI_KIND), my_mpistatus, mpierr) + +#else /* WITH_MPI */ + work(recvoffset:recvoffset+recvcount-1) = work(sendoffset:sendoffset+sendcount-1) +#endif /* WITH_MPI */ + + ! 7. reverse data + if (trans .eq. 1) then + call reverse_matrix_local_& + &PRECISION & + (1,lrows,lsize,work(recvoffset),lrows,work(work_offset),lwork) + else + call reverse_matrix_local_& + &PRECISION & + (0,lsize,lcols,work(recvoffset),lsize,work(work_offset),lwork) + end if + + ! 8. store in temp matrix + if (trans .eq. 1) then + do icol=1,lsize + work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & + work(recvoffset+(icol-1)*lrows:recvoffset+icol*lrows-1) + end do + + newmatrix_offset = newmatrix_offset + lsize*lrows + else + do icol=1,lcols + work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & + work(recvoffset+(icol-1)*lsize:recvoffset+icol*lsize-1) + end do + + newmatrix_offset = newmatrix_offset + lsize + end if + end if + + if (src_process .eq. mpirank) then + ! 5. pack data + if (trans .eq. 1) then + do icol=offset,offset+lsize-1 + work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & + a(1:lrows,icol) + end do + else + do icol=1,lcols + work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & + a(offset:offset+lsize-1,icol) + end do + end if + + ! 6a. call MPI_Send +#ifdef WITH_MPI + call MPI_Send(work(sendoffset), int(sendcount,kind=MPI_KIND), MPI_REAL_PRECISION, & + int(dest_process,kind=MPI_KIND), int(current_index,kind=MPI_KIND), & + int(mpicomm,kind=MPI_KIND), mpierr) +#endif /* WITH_MPI */ + end if + end if + + current_index = next_index + end do + + ! 9. 
copy temp matrix to real matrix + newmatrix_offset = recvoffset + lrows*lcols + do icol=1,lcols + a(1:lrows,icol) = & + work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) + end do +end subroutine diff -Nru elpa-2016.05.001/src/elpa2/redist_band.F90 elpa-2019.11.001/src/elpa2/redist_band.F90 --- elpa-2016.05.001/src/elpa2/redist_band.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa2/redist_band.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,293 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! 
license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: Andreas Marek, MPCDF +#endif +! -------------------------------------------------------------------------------------------------- +! redist_band: redistributes band from 2D block cyclic form to 1D band + +#include "config-f90.h" + +subroutine redist_band_& +&MATH_DATATYPE& +&_& +&PRECISION & + (obj, a_mat, a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU) + + use elpa_abstract_impl + use elpa2_workload + use precision + use iso_c_binding + use cuda_functions + use elpa_utilities, only : local_index + use elpa_mpi + implicit none + + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: useGPU + integer(kind=ik), intent(in) :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(in) :: a_mat(lda, matrixCols) + MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(out) :: ab(:,:) + + integer(kind=ik), allocatable :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), & + block_limits(:) + integer(kind=ik), allocatable :: global_id(:,:), global_id_tmp(:,:) + MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: sbuf(:,:,:), rbuf(:,:,:), buf(:,:) + + integer(kind=ik) :: i, j, my_pe, n_pes, my_prow, np_rows, my_pcol, np_cols, & + nfact, np, npr, npc, is, js + integer(kind=MPI_KIND) :: my_peMPI, n_pesMPI, my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI + integer(kind=MPI_KIND) :: mpierr + integer(kind=ik) :: nblocks_total, il, jl, l_rows, l_cols, n_off + + logical :: successCUDA + integer(kind=c_intptr_t) :: a_dev + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + + call obj%timer%start("redist_band_& + &MATH_DATATYPE& + 
&" // & + &PRECISION_SUFFIX & + ) + + + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(communicator,kind=MPI_KIND), my_peMPI, mpierr) + call mpi_comm_size(int(communicator,kind=MPI_KIND), n_pesMPI, mpierr) + + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) + + my_pe = int(my_peMPI,kind=c_int) + n_pes = int(n_pesMPI,kind=c_int) + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + call obj%timer%stop("mpi_communication") + + ! Get global_id mapping 2D procssor coordinates to global id + + allocate(global_id(0:np_rows-1,0:np_cols-1)) +#ifdef WITH_OPENMP + allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) +#endif + global_id(:,:) = 0 + global_id(my_prow, my_pcol) = my_pe +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") +#ifdef WITH_OPENMP + global_id_tmp(:,:) = global_id(:,:) + call mpi_allreduce(global_id_tmp, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, mpi_sum, & + int(communicator,kind=MPI_KIND), mpierr) + deallocate(global_id_tmp) +#else + call mpi_allreduce(mpi_in_place, global_id, int(np_rows*np_cols,kind=MPI_KIND), mpi_integer, mpi_sum, & + int(communicator,kind=MPI_KIND), mpierr) +#endif + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + ! Set work distribution + + nblocks_total = (na-1)/nbw + 1 + + allocate(block_limits(0:n_pes)) + call divide_band(obj, nblocks_total, n_pes, block_limits) + + + allocate(ncnt_s(0:n_pes-1)) + allocate(nstart_s(0:n_pes-1)) + allocate(ncnt_r(0:n_pes-1)) + allocate(nstart_r(0:n_pes-1)) + + + nfact = nbw/nblk + + ! Count how many blocks go to which PE + + ncnt_s(:) = 0 + np = 0 ! 
receiver PE number + do j=0,(na-1)/nblk ! loop over rows of blocks + if (j/nfact==block_limits(np+1)) np = np+1 + if (mod(j,np_rows) == my_prow) then + do i=0,nfact + if (mod(i+j,np_cols) == my_pcol) then + ncnt_s(np) = ncnt_s(np) + 1 + endif + enddo + endif + enddo + + ! Allocate send buffer + + allocate(sbuf(nblk,nblk,sum(ncnt_s))) + sbuf(:,:,:) = 0. + + ! Determine start offsets in send buffer + + nstart_s(0) = 0 + do i=1,n_pes-1 + nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) + enddo + + ! Fill send buffer + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a_mat + + np = 0 + do j=0,(na-1)/nblk ! loop over rows of blocks + if (j/nfact==block_limits(np+1)) np = np+1 + if (mod(j,np_rows) == my_prow) then + do i=0,nfact + if (mod(i+j,np_cols) == my_pcol) then + nstart_s(np) = nstart_s(np) + 1 + js = (j/np_rows)*nblk + is = ((i+j)/np_cols)*nblk + jl = MIN(nblk,l_rows-js) + il = MIN(nblk,l_cols-is) + + sbuf(1:jl,1:il,nstart_s(np)) = a_mat(js+1:js+jl,is+1:is+il) + endif + enddo + endif + enddo + + ! Count how many blocks we get from which PE + + ncnt_r(:) = 0 + do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) + npr = mod(j,np_rows) + do i=0,nfact + npc = mod(i+j,np_cols) + np = global_id(npr, npc) + ncnt_r(np) = ncnt_r(np) + 1 + enddo + enddo + + ! Allocate receive buffer + + allocate(rbuf(nblk,nblk,sum(ncnt_r))) + + ! Set send counts/send offsets, receive counts/receive offsets + ! now actually in variables, not in blocks + + ncnt_s(:) = ncnt_s(:)*nblk*nblk + + nstart_s(0) = 0 + do i=1,n_pes-1 + nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) + enddo + + ncnt_r(:) = ncnt_r(:)*nblk*nblk + + nstart_r(0) = 0 + do i=1,n_pes-1 + nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) + enddo + + ! 
Exchange all data with MPI_Alltoallv +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + + call MPI_Alltoallv(sbuf, int(ncnt_s,kind=MPI_KIND), int(nstart_s,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + rbuf, int(ncnt_r,kind=MPI_KIND), int(nstart_r,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, & + int(communicator,kind=MPI_KIND), mpierr) + + call obj%timer%stop("mpi_communication") +#else /* WITH_MPI */ + rbuf = sbuf +#endif /* WITH_MPI */ + +! set band from receive buffer + + ncnt_r(:) = ncnt_r(:)/(nblk*nblk) + + nstart_r(0) = 0 + do i=1,n_pes-1 + nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) + enddo + + allocate(buf((nfact+1)*nblk,nblk)) + + ! n_off: Offset of ab within band + n_off = block_limits(my_pe)*nbw + + do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) + npr = mod(j,np_rows) + do i=0,nfact + npc = mod(i+j,np_cols) + np = global_id(npr, npc) + nstart_r(np) = nstart_r(np) + 1 +#if REALCASE==1 + buf(i*nblk+1:i*nblk+nblk,:) = transpose(rbuf(:,:,nstart_r(np))) +#endif +#if COMPLEXCASE==1 + buf(i*nblk+1:i*nblk+nblk,:) = conjg(transpose(rbuf(:,:,nstart_r(np)))) +#endif + enddo + do i=1,MIN(nblk,na-j*nblk) + ab(1:nbw+1,i+j*nblk-n_off) = buf(i:i+nbw,i) + enddo + enddo + + deallocate(ncnt_s, nstart_s) + deallocate(ncnt_r, nstart_r) + deallocate(global_id) + deallocate(block_limits) + + deallocate(sbuf, rbuf, buf) + + call obj%timer%stop("redist_band_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX & + ) + +end subroutine + diff -Nru elpa-2016.05.001/src/elpa2_compute.F90 elpa-2019.11.001/src/elpa2_compute.F90 --- elpa-2016.05.001/src/elpa2_compute.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_compute.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,5958 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), fomerly known as -! 
Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! 
distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" - -module ELPA2_compute - -! Version 1.1.2, 2011-02-21 - - use elpa_utilities - USE ELPA1_compute - use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve - use elpa2_utilities - use elpa_pdgeqrf - use elpa_mpi - use aligned_mem - - implicit none - - PRIVATE ! By default, all routines contained are private - - public :: bandred_real - public :: tridiag_band_real - public :: trans_ev_tridi_to_band_real - public :: trans_ev_band_to_full_real - - public :: bandred_complex - public :: tridiag_band_complex - public :: trans_ev_tridi_to_band_complex - public :: trans_ev_band_to_full_complex - - public :: band_band_real - public :: divide_band - - integer, public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used - ! 0 for unblocked - ! 1 for blocked (maxrank: nblk) - - contains - - subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success, useQR) - - !------------------------------------------------------------------------------- - ! bandred_real: Reduces a distributed symmetric matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! 
nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith of output matrix - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif -#ifdef WITH_OPENMP - use omp_lib -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), tmat(nbw,nbw,*) -#else - real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, vmrCols - integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - real(kind=rk), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) - - ! needed for blocked QR decomposition - integer(kind=ik) :: PQRPARAM(11), work_size - real(kind=rk) :: dwork_size(1) - real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - logical, intent(in) :: useQR - - integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - - ! 
Semibandwith nbw must be a multiple of blocksize nblk - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - if (useQR) then - if (which_qr_decomposition == 1) then - call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') - allocate(tauvector(na)) - allocate(blockheuristic(nblk)) - l_rows = local_index(na, my_prow, np_rows, nblk, -1) - allocate(vmr(max(l_rows,1),na)) - - vmrCols = na -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & - nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmr(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & - vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & - nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) -#endif - work_size = dwork_size(1) - allocate(work_blocked(work_size)) - - work_blocked = 0.0d0 - deallocate(vmr) - endif - endif - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! 
Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - allocate(vmr(max(l_rows,1),2*n_cols)) - allocate(umc(max(l_cols,1),2*n_cols)) - - allocate(vr(l_rows+1)) - - vmr(1:l_rows,1:n_cols) = 0. - vr(:) = 0 - tmat(:,:,istep) = 0 - - ! Reduce current block to lower triangular form - - if (useQR) then - if (which_qr_decomposition == 1) then - - vmrCols = 2*n_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), & - na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmr(1:max(l_rows,1),1:vmrCols) , & - max(l_rows,1), vmrCols, tauvector(1:na), na, & - tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) -#endif - endif - else - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if (nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! 
vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0. - endif -#ifdef WITH_MPI - call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - aux2 = aux1 -#endif - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau) - - ! Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1. - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau -#ifdef WITH_MPI - call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) -#endif - vmr(1:lr,lc) = vr(1:lr) - tau = vr(lr+1) - tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - ! Local dot product - - aux1 = 0 -#ifdef WITH_OPENMP - !Open up one omp region to avoid paying openmp overhead. - !This does not help performance due to the addition of two openmp barriers around the MPI call, - !But in the future this may be beneficial if these barriers are replaced with a faster implementation - - !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) - mynlc = 0 ! 
number of local columns - - !This loop does not have independent iterations, - !'mynlc' is incremented each iteration, and it is difficult to remove this dependency - !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration - !That is, a thread only executes the work associated with an iteration if its thread id is congruent to - !the iteration number modulo the number of threads - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0 ) then - mynlc = mynlc+1 - if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then - if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - endif - enddo - - ! Get global dot products - !$omp barrier - !$omp single -#ifdef WITH_MPI - if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - if (mynlc>0) aux2 = aux1 -#endif - !$omp end single - !$omp barrier - - ! Transform - transformChunkSize=32 - mynlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - mynlc = mynlc+1 - !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 - !However, for some reason this is slower than doing it manually, so it is parallelized as below. - do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize - do pp = 1,transformChunkSize - if (pp + ii > lr) exit - a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) - enddo - enddo - endif - enddo - !$omp end parallel -#else /* WITH_OPENMP */ - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products -#ifdef WITH_MPI - if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - if (nlc>0) aux2=aux1 -#endif - ! 
Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) - endif - enddo -#endif /* WITH_OPENMP */ - - enddo - - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use dsyrk - - vav = 0 - if (l_rows>0) & - call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1)) - call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc vmc (stored in umc, second half) - - call elpa_transpose_vectors_real (vmr, ubound(vmr,dim=1), mpi_comm_rows, & - umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - !Code for Algorithm 4 - - n_way = 1 -#ifdef WITH_OPENMP - n_way = omp_get_max_threads() -#endif - !umc(1:l_cols,1:n_cols) = 0.d0 - !vmr(1:l_rows,n_cols+1:2*n_cols) = 0 -#ifdef WITH_OPENMP - !$omp parallel private( i,lcs,lce,lrs,lre) -#endif - if (n_way > 1) then - !$omp do - do i=1,min(l_cols_tile, l_cols) - umc(i,1:n_cols) = 0.d0 - enddo - !$omp do - do i=1,l_rows - vmr(i,n_cols+1:2*n_cols) = 0.d0 - enddo - if (l_cols>0 .and. l_rows>0) then - - !SYMM variant 4 - !Partitioned Matrix Expression: - ! Ct = Atl Bt + Atr Bb - ! Cb = Atr' Bt + Abl Bb - ! - !Loop invariant: - ! Ct = Atl Bt + Atr Bb - ! - !Update: - ! C1 = A10'B0 + A11B1 + A21 B2 - ! 
- !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls - !is easily parallelized, and regardless of choise of algorithm, - !the startup cost for parallelizing the dgemms inside the loop is too great - - !$omp do schedule(static,1) - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 ! local column start - lce = min(l_cols, (i+1)*l_cols_tile) ! local column end - - lrs = i*l_rows_tile+1 ! local row start - lre = min(l_rows, (i+1)*l_rows_tile) ! local row end - - !C1 += [A11 A12] [B1 - ! B2] - if( lre > lrs .and. l_cols > lcs ) then - call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, & - 1.d0, a(lrs,lcs), ubound(a,dim=1), & - umc(lcs,n_cols+1), ubound(umc,dim=1), & - 0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1)) - endif - - ! C1 += A10' B0 - if( lce > lcs .and. i > 0 ) then - call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, & - 1.d0, a(1,lcs), ubound(a,dim=1), & - vmr(1,1), ubound(vmr,dim=1), & - 0.d0, umc(lcs,1), ubound(umc,dim=1)) - endif - enddo - endif - else - umc(1:l_cols,1:n_cols) = 0.d0 - vmr(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce 1) then - call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, & - umc, ubound(umc,dim=1), mpi_comm_cols, & - istep*nbw, n_cols, nblk) - endif -#ifdef WITH_MPI - if (l_cols>0) then - allocate(tmp(l_cols,n_cols)) - call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp) - endif -#endif - ! U = U * Tmat**T - - call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) - - ! 
VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), & - ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1)) - call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), & - ubound(tmat,dim=1),vav,ubound(vav,dim=1)) - - call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols) - - ! U = U - 0.5 * V * VAV - call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, & - ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1)) - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_real (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - ! A = A - V*U**T - U*V**T -#ifdef WITH_OPENMP - !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) - n_threads = omp_get_num_threads() - if (mod(n_threads, 2) == 0) then - n_way = 2 - else - n_way = 1 - endif - - m_way = n_threads / n_way - - m_id = mod(omp_get_thread_num(), m_way) - n_id = omp_get_thread_num() / m_way - - do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way - i = ii / tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce lre ) myend = lre - if ( myend-mystart+1 < 1) cycle - - call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, & - vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), & - 1.d0,a(mystart,lcs),ubound(a,1)) - enddo - !$omp end parallel - -#else /* WITH_OPENMP */ - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce= ((t_blocking+1)*nbw) ) then - cwy_blocking = t_blocking * nbw - - allocate(tmp1(max_local_cols*cwy_blocking)) - allocate(tmp2(max_local_cols*cwy_blocking)) - allocate(hvb(max_local_rows*cwy_blocking)) - 
allocate(hvm(max_local_rows,cwy_blocking)) - allocate(tmat_complete(cwy_blocking,cwy_blocking)) - allocate(t_tmp(cwy_blocking,nbw)) - allocate(t_tmp2(cwy_blocking,nbw)) -! else -! allocate(tmp1(max_local_cols*nbw)) -! allocate(tmp2(max_local_cols*nbw)) -! allocate(hvb(max_local_rows*nbw)) -! allocate(hvm(max_local_rows,nbw)) -! endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - -! if ( na >= ((t_blocking+1)*nbw) ) then - - do istep=1,((na-1)/nbw-1)/t_blocking + 1 - ! This the call when using na >= ((t_blocking+1)*nbw) - ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - ! As an alternative we add some special case handling if na < cwy_blocking - IF (na < cwy_blocking) THEN - n_cols = MAX(0, na-nbw) - IF ( n_cols .eq. 0 ) THEN - EXIT - END IF - ELSE - n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - END IF - - ! Broadcast all Householder vectors for current step compressed in hvb - - nb = 0 - ns = 0 - - do lc = 1, n_cols - ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! absolute number of pivot row - - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number - - if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - - nb = nb+l_rows - - if (lc==n_cols .or. mod(ncol,nblk)==0) then -#ifdef WITH_MPI - call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) -#endif - ns = nb - endif - enddo - - ! Expand compressed Householder vectors into matrix hvm - - nb = 0 - do lc = 1, n_cols - nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! 
row length for bcast - - hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) - if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. - - nb = nb+l_rows - enddo - - l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) - - ! compute tmat2 out of tmat(:,:,) - tmat_complete = 0 - do i = 1, t_blocking - t_cols = MIN(nbw, n_cols - (i-1)*nbw) - if (t_cols <= 0) exit - t_rows = (i - 1) * nbw - tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) - if (i > 1) then - call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & - max_local_rows, 0.d0, t_tmp, cwy_blocking) -#ifdef WITH_MPI - call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - t_tmp2 = t_tmp -#endif - call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking) - call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking) - tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) - endif - enddo - - ! Q = Q - V * T**T * V**T * Q - - if (l_rows>0) then - call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & - q,ldq,0.d0,tmp1,n_cols) - else - tmp1(1:l_cols*n_cols) = 0 - endif -#ifdef WITH_MPI - call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -#else - tmp2=tmp1 -#endif - - if (l_rows>0) then - call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols) - call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), tmp2,n_cols,1.d0,q,ldq) - endif - enddo - -! else -! -! do istep=1,(na-1)/nbw -! -! n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step -! -! ! Broadcast all Householder vectors for current step compressed in hvb -! -! nb = 0 -! ns = 0 -! -! do lc = 1, n_cols -! ncol = istep*nbw + lc ! 
absolute column number of householder vector -! nrow = ncol - nbw ! absolute number of pivot row -! -! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast -! l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number -! -! if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) -! -! nb = nb+l_rows -! -! if (lc==n_cols .or. mod(ncol,nblk)==0) then -! call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) -! ns = nb -! endif -! enddo -! -! ! Expand compressed Householder vectors into matrix hvm -! -! nb = 0 -! do lc = 1, n_cols -! nrow = (istep-1)*nbw+lc ! absolute number of pivot row -! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast -! -! hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) -! if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. -! -! nb = nb+l_rows -! enddo -! -! l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) -! -! ! Q = Q - V * T**T * V**T * Q -! -! if (l_rows>0) then -! call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & -! q,ldq,0.d0,tmp1,n_cols) -! else -! tmp1(1:l_cols*n_cols) = 0 -! endif -! -! call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) -! -! if (l_rows>0) then -! call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) -! call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), & -! tmp2,n_cols,1.d0,q,ldq) -! endif -! enddo -! endif - - deallocate(tmp1, tmp2, hvb, hvm) -! if ( na >= ((t_blocking+1)*nbw) ) then - deallocate(tmat_complete, t_tmp, t_tmp2) -! 
endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_real") -#endif - end subroutine trans_ev_band_to_full_real - - subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_real: - ! Reduces a real symmetric band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwith - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(in) :: a(lda,*) -#else - real(kind=rk), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! 
set only on PE 0 - real(kind=rk), intent(out), & - allocatable :: hh_trans_real(:,:) - - real(kind=rk) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - real(kind=rk) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - real(kind=rk), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - real(kind=rk), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) - -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads -#endif - -#ifndef WITH_MPI - integer(kind=ik) :: startAddr -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_real") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! 
Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_OPENMP - allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) -#endif - -#ifdef WITH_MPI - -#ifndef WITH_OPENMP - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#else - global_id_tmp(:,:) = global_id(:,:) - call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - deallocate(global_id_tmp) -#endif - -#endif /* WITH_MPI */ - ! Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb)) - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_real(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows)) - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. 
local_size>0 .and. nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! Allocate space for HH vectors - - allocate(hh_trans_real(nb,num_hh_vecs)) - - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks)) ! Recv requests - allocate(ireq_hhs(nblocks)) ! Send requests - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_chunks = num_chunks+1 -#ifdef WITH_MPI - - call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real8, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#else - ! carefull non-block recv data copy must be done at wait or send - ! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) -#endif - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo -#ifdef WITH_MPI - ireq_hhs(:) = MPI_REQUEST_NULL -#endif - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors - allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks)) - allocate(hh_dst(nblocks)) - - hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! Limits for sending - - allocate(snd_limits(0:np_rows,nblocks)) - - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 - max_threads = omp_get_max_threads() - - ! 
For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads)) - - ! Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads)) - hv_t = 0 - tau_t = 0 -#endif - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - - -#ifndef WITH_MPI - startAddr = ubound(hh_trans_real,dim=2) -#endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na-1) then - ! Transform first column of remaining matrix - vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) - call hh_transform_real(ab(2,na_s-n_off),vnorm2,hf,tau) - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - endif - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! 
Receive Householder vector from previous task, from PE owning subdiagonal -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS,mpierr) -#else - call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - hv(1:nb) = hv_s(1:nb) -#else - hv(1:nb) = hv_s(1:nb) -#endif - -#endif /* WITH_MPI */ - tau = hv(1) - hv(1) = 1. - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif - - -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! The code would be considerably easier, if the MPI communication would be made within - ! the parallel region - this is avoided here since this would require - ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! 
with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istepna) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block - - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) - - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! Transform subdiagonal block - - call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1. 
- hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - - ! update subdiagonal block for old and new Householder transformation - ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster - - call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1) - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! my_thread -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - - ! Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 -#ifdef WITH_MPI - if (istep>=max_threads .and. ne <= na) then - call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) - endif -#else - if (istep>=max_threads .and. ne <= na) then - ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) - endif -#endif - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. 
ne < na) then -#ifdef WITH_MPI - call mpi_wait(ireq_hv,mpi_status,mpierr) -#endif - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) -#ifdef WITH_MPI - call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) -#endif - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif /* WITH_OPENMP */ - - do iblk=1,nblocks - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block - - if (ns+n_off>na) exit - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination -#ifdef WITH_MPI - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - startAddr = startAddr - hh_cnt(iblk) - hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif /* WITH_MPI */ - - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - ! 
The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. -#endif /* WITH_OPENMP */ - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Multiply diagonal block and subdiagonal block with Householder vector - - if (iblk==nblocks .and. nc==nb) then - - ! We need the last column from the next PE. - ! First do the matrix multiplications without last column ... - - ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - - ! Subdiagonal block - if (nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - ! ... then request last column ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS,mpierr) -#else - call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - ab(1:nb+1,ne) = ab_s(1:nb+1) -#else - ab(1:nb+1,ne) = ab_s(1:nb+1) -#endif - -#endif /* WITH_MPI */ - ! ... and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else - - ! Normal matrix multiply - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - if (nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) - - endif - - ! Calculate first column of subdiagonal block and calculate new - ! Householder transformation for this column - - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 - - if (nr>0) then - - ! 
complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation ... - if (nr>1) then - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif - - ! ... and send it away immediatly if this is the last block - - if (iblk==nblocks) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) -#endif - -#endif /* WITH_MPI */ - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) -#ifdef WITH_MPI - call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) -#endif - endif - - endif - - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - if (my_pe>0 .and. iblk==1) then - - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... - - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) - - ! ... send it away ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) -#endif - -#endif /* WITH_MPI */ - ab_s(1:nb+1) = ab(1:nb+1,ns) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - ! ... and calculate remaining columns with rank-2 update - if (nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) - else - ! No need to send, just a rank-2 update - call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) - endif - - ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb - - if (nr>0) then - if (nr>1) then - call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1) - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! 
Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - endif - endif - - ! Use new HH vector for the next block - hv(:) = hv_new(:) - tau = tau_new - - enddo - -#ifdef WITH_OPENMP - endif - - do iblk = 1, nblocks - - if (hh_dst(iblk) >= np_rows) exit - if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit - - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination -#ifdef WITH_MPI - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - startAddr = startAddr - hh_cnt(iblk) - hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - enddo -#endif /* WITH_OPENMP */ - enddo - - ! 
Finish the last outstanding requests - -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) - - allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr) - deallocate(mpi_statuses) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - -#ifdef WITH_MPI - call mpi_barrier(mpi_comm,mpierr) -#endif - deallocate(ab) - deallocate(ireq_hhr, ireq_hhs) - deallocate(hh_cnt, hh_dst) - deallocate(hh_gath, hh_send) - deallocate(limits, snd_limits) - deallocate(block_limits) - deallocate(global_id) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_band_real") -#endif - - end subroutine tridiag_band_real - - subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success, & - THIS_REAL_ELPA_KERNEL) - !------------------------------------------------------------------------------- - ! trans_ev_tridi_to_band_real: - ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nev Number eigenvectors to compute (= columns of matrix q) - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nb semi bandwith - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! matrixCols local columns of matrix q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! 
MPI-Communicators for rows/columns/both - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - use pack_unpack_real - use compute_hh_trafo_real - implicit none - - integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - real(kind=rk), intent(in) :: hh_trans_real(:,:) - integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol - - integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 - integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end - integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end - integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length - integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count -#ifdef WITH_OPENMP - integer(kind=ik) :: thread_width, csw, b_off, b_len -#endif - integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd - integer(kind=ik) :: a_off, current_tv_off, max_blk_size - integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk -#ifdef WITH_OPENMP -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif -#endif - logical :: flag - -#ifdef WITH_OPENMP - real(kind=rk), pointer :: a(:,:,:,:) -#else - real(kind=rk), pointer :: a(:,:,:) -#endif - real(kind=rk) :: a_real - - type(c_ptr) :: a_ptr - real(kind=rk), allocatable :: row(:) - -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) -#else - real(kind=rk), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:,:), 
bottom_border_recv_buffer(:,:,:) -#endif - real(kind=rk), allocatable :: result_buffer(:,:,:) - real(kind=rk), allocatable :: bcast_buffer(:,:) - - integer(kind=ik) :: n_off - integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) - integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) - integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) -#endif - ! MPI send/recv tags, arbitrary - - integer(kind=ik), parameter :: bottom_recv_tag = 111 - integer(kind=ik), parameter :: top_recv_tag = 222 - integer(kind=ik), parameter :: result_recv_tag = 333 - - ! Just for measuring the kernel performance - real(kind=rk) :: kernel_time - ! long integer - integer(kind=lik) :: kernel_flops - -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads -#endif - - logical, intent(in) :: wantDebug - logical :: success -#ifndef WITH_MPI - integer(kind=ik) :: j1 -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_tridi_to_band_real") -#endif - success = .true. - kernel_time = 1.d-100 - kernel_flops = 0 - -#ifdef WITH_OPENMP - max_threads = 1 - max_threads = omp_get_max_threads() -#endif - call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) - call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) - call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) - call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - nfact = nbw / nblk - - - ! 
local number of eigenvectors - l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) - - if (l_nev==0) then -#ifdef WITH_OPENMP - thread_width = 0 -#endif - stripe_width = 0 - stripe_count = 0 - last_stripe_width = 0 - else - ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into - ! every primary cache -#ifdef WITH_OPENMP - thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread -#endif - stripe_width = 48 ! Must be a multiple of 4 -#ifdef WITH_OPENMP - stripe_count = (thread_width-1)/stripe_width + 1 -#else - stripe_count = (l_nev-1)/stripe_width + 1 -#endif - ! Adapt stripe width so that last one doesn't get too small -#ifdef WITH_OPENMP - stripe_width = (thread_width-1)/stripe_count + 1 -#else - stripe_width = (l_nev-1)/stripe_count + 1 -#endif - stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! - last_stripe_width = l_nev - (stripe_count-1)*stripe_width - endif - - ! Determine the matrix distribution at the beginning - - allocate(limits(0:np_rows)) - - call determine_workload(na, nbw, np_rows, limits) - - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - a_dim2 = max_blk_size + nbw - -#ifdef WITH_OPENMP - if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then -#else - if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then -#endif - write(error_unit,*) "Cannot allocate memory" - success = .false. - return - endif - - call c_f_pointer(a_ptr, a, & -#ifdef WITH_OPENMP - [stripe_width,a_dim2,stripe_count,max_threads] & -#else - [stripe_width,a_dim2,stripe_count] & -#endif - ) - -#ifndef WITH_OPENMP - a(:,:,:) = 0 -#else - ! a(:,:,:,:) should be set to 0 in a parallel region, not here! -#endif - - allocate(row(l_nev)) - row(:) = 0 - - ! Copy q from a block cyclic distribution into a distribution with contiguous rows, - ! 
and transpose the matrix using stripes of given stripe_width for cache blocking. - - ! The peculiar way it is done below is due to the fact that the last row should be - ! ready first since it is the first one to start below - -#ifdef WITH_OPENMP - ! Please note about the OMP usage below: - ! This is not for speed, but because we want the matrix a in the memory and - ! in the cache of the correct thread (if possible) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - !$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! - enddo - !$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - - do ip = np_rows-1, 0, -1 - if (my_prow == ip) then - ! Receive my rows which have not yet been received - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src < my_prow) then -#ifdef WITH_OPENMP -#ifdef WITH_MPI - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#else - row(1:l_nev) = row(1:l_nev) -#endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, stripe_count, & - thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - row(1:l_nev) = row(1:l_nev) -#endif - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif /* WITH_OPENMP */ - elseif (src==my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef 
WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif /* WITH_OPENMP */ - - endif - enddo - ! Send all rows which have not yet been send - src_offset = 0 - do dst = 0, ip-1 - do i=limits(dst)+1,limits(dst+1) - if (mod((i-1)/nblk, np_rows) == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - call MPI_Send(row, l_nev, MPI_REAL8, dst, 0, mpi_comm_rows, mpierr) -#endif - endif - enddo - enddo - else if (my_prow < ip) then - ! Send all rows going to PE ip - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - call MPI_Send(row, l_nev, MPI_REAL8, ip, 0, mpi_comm_rows, mpierr) -#endif - endif - enddo - ! 
Receive all rows from PE ip - do i=limits(my_prow)+1,limits(my_prow+1) - src = mod((i-1)/nblk, np_rows) - if (src == ip) then -#ifdef WITH_OPENMP -#ifdef WITH_MPI - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#else - row(1:l_nev) = row(1:l_nev) -#endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row,i-limits(my_prow),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - row(1:l_nev) = row(1:l_nev) -#endif - call unpack_row_real_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) -#endif /* WITH_OPENMP */ - - endif - enddo - endif - enddo - - ! Set up result buffer queue - - num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows - - num_result_buffers = 4*nfact - allocate(result_buffer(l_nev,nblk,num_result_buffers)) - - allocate(result_send_request(num_result_buffers)) - allocate(result_recv_request(num_result_buffers)) -#ifdef WITH_MPI - result_send_request(:) = MPI_REQUEST_NULL - result_recv_request(:) = MPI_REQUEST_NULL -#endif - ! Queue up buffers - - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j = 1, min(num_result_buffers, num_result_blocks) -#ifdef WITH_MPI - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) -#else - ! carefull the "recv" has to be done at the corresponding wait or send - ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) -#endif - enddo - endif - - num_bufs_recvd = 0 ! No buffers received yet - - ! 
Initialize top/bottom requests - - allocate(top_send_request(stripe_count)) - allocate(top_recv_request(stripe_count)) - allocate(bottom_send_request(stripe_count)) - allocate(bottom_recv_request(stripe_count)) -#ifdef WITH_MPI - top_send_request(:) = MPI_REQUEST_NULL - top_recv_request(:) = MPI_REQUEST_NULL - bottom_send_request(:) = MPI_REQUEST_NULL - bottom_recv_request(:) = MPI_REQUEST_NULL -#endif - -#ifdef WITH_OPENMP - allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - - top_border_send_buffer(:,:) = 0 - top_border_recv_buffer(:,:) = 0 - bottom_border_send_buffer(:,:) = 0 - bottom_border_recv_buffer(:,:) = 0 - - ! Initialize broadcast buffer -#else - allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) - - top_border_send_buffer(:,:,:) = 0 - top_border_recv_buffer(:,:,:) = 0 - bottom_border_send_buffer(:,:,:) = 0 - bottom_border_recv_buffer(:,:,:) = 0 -#endif - - allocate(bcast_buffer(nbw, max_blk_size)) - bcast_buffer = 0 - - current_tv_off = 0 ! Offset of next row to be broadcast - - ! ------------------- start of work loop ------------------- - - a_off = 0 ! 
offset in A (to avoid unnecessary shifts) - - top_msg_length = 0 - bottom_msg_length = 0 - - do sweep = 0, (na-1)/nbw - - current_n = na - sweep*nbw - call determine_workload(current_n, nbw, np_rows, limits) - current_n_start = limits(my_prow) - current_n_end = limits(my_prow+1) - current_local_n = current_n_end - current_n_start - - next_n = max(current_n - nbw, 0) - call determine_workload(next_n, nbw, np_rows, limits) - next_n_start = limits(my_prow) - next_n_end = limits(my_prow+1) - next_local_n = next_n_end - next_n_start - - if (next_n_end < next_n) then - bottom_msg_length = current_n_end - next_n_end - else - bottom_msg_length = 0 - endif - - if (next_local_n > 0) then - next_top_msg_length = current_n_start - next_n_start - else - next_top_msg_length = 0 - endif - - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - do i = 1, stripe_count -#ifdef WITH_OPENMP - csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" - b_len = csw*nbw*max_threads -#ifdef WITH_MPI - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else -! carefull the recieve has to be done at the corresponding wait or send -! 
bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) -#endif - -#endif /* WITH_OPENMP */ - enddo - endif - - if (current_local_n > 1) then - if (my_pcol == mod(sweep,np_cols)) then - bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n) - current_tv_off = current_tv_off + current_local_n - endif -#ifdef WITH_MPI - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL8, mod(sweep,np_cols), mpi_comm_cols, mpierr) -#endif - else - ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real - bcast_buffer(:,1) = 0 - endif - - if (l_nev == 0) cycle - - if (current_local_n > 0) then - - do i = 1, stripe_count -#ifdef WITH_OPENMP - ! Get real stripe width for strip i; - ! The last OpenMP tasks may have an even smaller stripe with, - ! but we don't care about this, i.e. we send/recv a bit too much in this case. - ! csw: current_stripe_width - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) -#endif - !wait_b - if (current_n_end < current_n) then -#ifdef WITH_OPENMP -#ifdef WITH_MPI - call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr) -#endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - n_off = current_local_n+a_off - b_len = csw*nbw - b_off = (my_thread-1)*b_len - a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & - reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else -#ifdef WITH_MPI - call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - n_off = current_local_n+a_off - a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) - -#endif - if (next_n_end < next_n) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call 
MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP -! carefull the recieve has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) -#else -! carefull the recieve has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) -#endif - -#endif /* WITH_MPI */ - endif - endif - - if (current_local_n <= bottom_msg_length + top_msg_length) then - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -0, current_local_n, i, my_thread, thread_width, & - THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call 
compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -0, current_local_n, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif - !send_b -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#endif - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif -#endif - endif -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & - bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) - endif -#endif - endif -#endif /* WITH_OPENMP */ - else - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & - a_off, nbw, 
max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, thread_width, & - THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - !send_b -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#endif - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif -#endif - endif -#else /* WITH_OPENMP */ - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -current_local_n - bottom_msg_length, bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) - - !send_b -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & - bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) - endif -#endif - endif -#endif /* WITH_OPENMP */ - - !compute -#ifdef WITH_OPENMP -#ifdef 
HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & - THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) - -#endif - !wait_t - if (top_msg_length>0) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call compute_hh_trafo_real_cpu(a, 
stripe_width,a_dim2,stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - b_len = csw*next_top_msg_length*max_threads -#ifdef WITH_MPI - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be done at the corresponding wait or send -! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be done at the corresponding wait or send -! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & -! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) -#endif - -#endif /* WITH_OPENMP */ - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#endif - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - if (sweep==0 .and. current_n_end < current_n .and. 
l_nev > 0) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) -#ifdef WITH_MPI - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) - endif -#endif - -#endif /* WITH_OPENMP */ - - endif - - ! Care that there are not too many outstanding top_recv_request's - if (stripe_count > 1) then - if (i>1) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif -#endif - else -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif -#endif - endif - endif - enddo - - top_msg_length = next_top_msg_length - else - ! wait for last top_send_request -#ifdef WITH_MPI - do i = 1, stripe_count -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - enddo -#endif - endif - - ! Care about the result - - if (my_prow == 0) then - - ! 
topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr) -#else - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif -#endif - dst = mod(num_blk, np_rows) - - if (dst == 0) then - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, row, j*nblk+i+a_off, stripe_width, & - stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - enddo -#ifdef WITH_MPI - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#else - if (j+num_result_buffers < num_result_blocks) & - result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) - if (my_prow > 0 .and. l_nev>0) then - do j1 = 1, min(num_result_buffers, num_result_blocks) - result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) - enddo - endif -#endif - endif - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! 
outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr) -#else - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) - -#endif - -#else /* WITH_MPI */ - flag = .true. -#endif /* WITH_MPI */ - - if (.not.flag) exit - else -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr) -#else - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif -#endif - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! Queue result buffer again if there are outstanding blocks left -#ifdef WITH_MPI - if (j+num_result_buffers < num_result_blocks) & - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#else - ! carefull the "recieve" has to be done at the corresponding wait or send -! if (j+num_result_buffers < num_result_blocks) & -! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) -#endif - enddo - num_bufs_recvd = j - - endif - - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - if (offset<0) then - if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset - success = .false. 
- return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - !$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo -#else - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) -#endif - enddo - enddo -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - a_off = 0 - endif - - enddo -#ifdef WITH_MPI - ! Just for safety: - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol -#endif - if (my_prow == 0) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif -#endif - endif -#ifdef WITH_MPI - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol -#endif - if (my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! deallocate all working space - - nullify(a) - call free(a_ptr) - deallocate(row) - deallocate(limits) - deallocate(result_send_request) - deallocate(result_recv_request) - deallocate(top_border_send_buffer) - deallocate(top_border_recv_buffer) - deallocate(bottom_border_send_buffer) - deallocate(bottom_border_recv_buffer) - deallocate(result_buffer) - deallocate(bcast_buffer) - deallocate(top_send_request) - deallocate(top_recv_request) - deallocate(bottom_send_request) - deallocate(bottom_recv_request) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_real") -#endif - return - end subroutine trans_ev_tridi_to_band_real - - subroutine single_hh_trafo(q, hh, nb, nq, ldq) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - ! Perform single real Householder transformation. - ! This routine is not performance critical and thus it is coded here in Fortran - - implicit none - integer(kind=ik) :: nb, nq, ldq - real(kind=rk) :: q(ldq, *), hh(*) - - integer(kind=ik) :: i - real(kind=rk) :: v(nq) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("single_hh_trafo") -#endif - - ! v = q * hh - v(:) = q(1:nq,1) - do i=2,nb - v(:) = v(:) + q(1:nq,i) * hh(i) - enddo - - ! v = v * tau - v(:) = v(:) * hh(1) - - ! 
q = q - v * hh**T - q(1:nq,1) = q(1:nq,1) - v(:) - do i=2,nb - q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("single_hh_trafo") -#endif - end subroutine - - subroutine determine_workload(na, nb, nprocs, limits) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nprocs - integer(kind=ik), intent(out) :: limits(0:nprocs) - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("determine_workload") -#endif - - if (na <= 0) then - limits(:) = 0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("determine_workload") -#endif - return - endif - - if (nb*nprocs > na) then - ! there is not enough work for all - do i = 0, nprocs - limits(i) = min(na, i*nb) - enddo - else - do i = 0, nprocs - limits(i) = (i*na)/nprocs - enddo - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("determine_workload") -#endif - end subroutine - - subroutine bandred_complex(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, success) - - !------------------------------------------------------------------------------- - ! bandred_complex: Reduces a distributed hermitian matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith of output matrix - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! 
Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), tmat(nbw,nbw,*) -#else - complex(kind=ck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows - integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - complex(kind=ck), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - ! Semibandwith nbw must be a multiple of blocksize nblk - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! 
minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - allocate(vmr(max(l_rows,1),2*n_cols)) - allocate(umc(max(l_cols,1),2*n_cols)) - - allocate(vr(l_rows+1)) - - vmr(1:l_rows,1:n_cols) = 0. - vr(:) = 0 - tmat(:,:,istep) = 0 - - ! Reduce current block to lower triangular form - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if(nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0. - endif -#ifdef WITH_MPI - call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - aux2 = aux1 -#endif - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau) - - ! 
Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1. - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau -#ifdef WITH_MPI - call MPI_Bcast(vr,lr+1,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) -#endif - vmr(1:lr,lc) = vr(1:lr) - tau = vr(lr+1) - tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - - ! Local dot product - - aux1 = 0 - - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products -#ifdef WITH_MPI - if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - if (nlc>0) aux2=aux1 -#endif - ! Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) - endif - enddo - - enddo - - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use zherk - - vav = 0 - if (l_rows>0) & - call zherk('U','C',n_cols,l_rows,CONE,vmr,ubound(vmr,dim=1),CZERO,vav,ubound(vav,dim=1)) - call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc vmc (stored in umc, second half) - - call elpa_transpose_vectors_complex (vmr, ubound(vmr,dim=1), mpi_comm_rows, & - umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! 
Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - - umc(1:l_cols,1:n_cols) = 0.d0 - vmr(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then - allocate(tmp(l_cols,n_cols)) - call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp) - endif -#endif - ! U = U * Tmat**T - - call ztrmm('Right','Upper','C','Nonunit',l_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) - - ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call zgemm('C','N',n_cols,n_cols,l_cols,CONE,umc,ubound(umc,dim=1),umc(1,n_cols+1), & - ubound(umc,dim=1),CZERO,vav,ubound(vav,dim=1)) - call ztrmm('Right','Upper','C','Nonunit',n_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),vav,ubound(vav,dim=1)) - - call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols) - - ! U = U - 0.5 * V * VAV - call zgemm('N','N',l_cols,n_cols,n_cols,(-0.5d0,0.d0),umc(1,n_cols+1),ubound(umc,dim=1),vav,ubound(vav,dim=1), & - CONE,umc,ubound(umc,dim=1)) - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - ! 
A = A - V*U**T - U*V**T - - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) then - call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & - q,ldq,CZERO,tmp1,n_cols) - else - tmp1(1:l_cols*n_cols) = 0 - endif -#ifdef WITH_MPI - call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) -#else - tmp2=tmp1 -#endif - if (l_rows>0) then - call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) - call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,dim=1), & - tmp2,n_cols,CONE,q,ldq) - endif - - enddo - - deallocate(tmp1, tmp2, hvb, hvm) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_complex") -#endif - - end subroutine trans_ev_band_to_full_complex - - - subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_complex: - ! Reduces a complex hermitian symmetric band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwith - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! 
MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck),intent(in) :: a(lda,*) -#else - complex(kind=ck), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 - complex(kind=ck), intent(inout), & - allocatable :: hh_trans_complex(:,:) - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - complex(kind=ck) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter - integer(kind=ik) :: omp_get_max_threads -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - complex(kind=ck), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - complex(kind=ck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) -#ifndef WITH_MPI - integer(kind=ik) :: startAddr -#endif - -! ! dummies for calling redist_band -! 
real*8 :: r_a(1,1), r_ab(1,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_complex") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_MPI - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#endif - - ! Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb)) - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_complex(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows)) - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! 
please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! Allocate space for HH vectors - - allocate(hh_trans_complex(nb,num_hh_vecs)) - - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks)) ! Recv requests - allocate(ireq_hhs(nblocks)) ! Send requests - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_chunks = num_chunks+1 -#ifdef WITH_MPI - call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#else - ! carefull non-block recv data copy must be done at wait or send - ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) -#endif - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo -#ifdef WITH_MPI - ireq_hhs(:) = MPI_REQUEST_NULL -#endif - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors - allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks)) - allocate(hh_dst(nblocks)) - - hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! 
Limits for sending - - allocate(snd_limits(0:np_rows,nblocks)) - - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 -!$ max_threads = omp_get_max_threads() - - ! For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads)) - - ! Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads)) - hv_t = 0 - tau_t = 0 -#endif - - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - -#ifndef WITH_MPI - startAddr = ubound(hh_trans_complex,dim=2) -#endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! Transform first column of remaining matrix - ! Opposed to the real case, the last step (istep=na-1) is needed here for making - ! the last subdiagonal element a real number - vnorm2 = sum(dble(ab(3:n+1,na_s-n_off))**2+dimag(ab(3:n+1,na_s-n_off))**2) - if (n<2) vnorm2 = 0. ! Safety only - call hh_transform_complex(ab(2,na_s-n_off),vnorm2,hf,tau) - - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! 
Receive Householder vector from previous task, from PE owning subdiagonal -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,mpi_status,mpierr) -#else - call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - hv(1:nb) = hv_s(1:nb) -#else - hv(1:nb) = hv_s(1:nb) -#endif -#endif /* WITH_MPI */ - tau = hv(1) - hv(1) = 1. - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! The code would be considerably easier, if the MPI communication would be made within - ! the parallel region - this is avoided here since this would require - ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! 
with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istepna) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block - - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - - x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) - - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! Transform subdiagonal block - - call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! 
(stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) - call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1. - hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - - ! update subdiagonal block for old and new Householder transformation - ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster - - call ZGEMV('C',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,(0.d0,0.d0),h(2),1) - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) & - - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! my_thread -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - - ! Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 -#ifdef WITH_MPI - if (istep>=max_threads .and. ne <= na) then - call mpi_recv(ab(1,ne-n_off),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) - endif -#else - if (istep>=max_threads .and. 
ne <= na) then - ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) - endif -#endif - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne < na) then -#ifdef WITH_MPI - call mpi_wait(ireq_hv,mpi_status,mpierr) -#endif - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) -#ifdef WITH_MPI - call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) -#endif - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif - - do iblk=1,nblocks - - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block - - if (ns+n_off>na) exit - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! 
Send to destination -#ifdef WITH_MPI - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - startAddr = startAddr - hh_cnt(iblk) - hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. -#endif - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - - ! Multiply diagonal block and subdiagonal block with Householder vector - - if (iblk==nblocks .and. nc==nb) then - - ! We need the last column from the next PE. - ! First do the matrix multiplications without last column ... - - ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - - ! Subdiagonal block - if (nr>0) call ZGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - ! ... then request last column ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) - -#else - call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - ab(1:nb+1,ne) = ab_s(1:nb+1) -#else - ab(1:nb+1,ne) = ab_s(1:nb+1) -#endif - -#endif /* WITH_MPI */ - ! ... 
and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else - ! Normal matrix multiply - call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) - if (nr>0) call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) - - endif - - ! Calculate first column of subdiagonal block and calculate new - ! Householder transformation for this column - - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 - - if (nr>0) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation ... - if (nr>1) then - vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) - call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif - - ! ... and send it away immediatly if this is the last block - - if (iblk==nblocks) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call mpi_wait(ireq_hv,mpi_status,mpierr) -#else - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) -#endif -#endif - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) -#ifdef WITH_MPI - call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) -#endif - endif - - endif - - - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - if (my_pe>0 .and. iblk==1) then - - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... - - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) - - ! ... send it away ... -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,mpi_status,mpierr) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) -#endif -#endif - ab_s(1:nb+1) = ab(1:nb+1,ns) -#ifdef WITH_MPI - call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - ! 
... and calculate remaining columns with rank-2 update - if (nc>1) call ZHER2('L',nc-1,(-1.d0,0.d0),hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) - else - ! No need to send, just a rank-2 update - call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) - endif - - ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb - - if (nr>0) then - if (nr>1) then - call ZGEMV('C',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,(0.d0,0.d0),h(2),1) - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) - enddo - endif - endif - - ! Use new HH vector for the next block - hv(:) = hv_new(:) - tau = tau_new - - enddo -#ifdef WITH_OPENMP - endif -#endif - -#ifdef WITH_OPENMP - do iblk = 1, nblocks - - if (hh_dst(iblk) >= np_rows) exit - if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit - - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination -#ifdef WITH_MPI - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - startAddr = startAddr - hh_cnt(iblk) - hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif - ! 
Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - enddo -#endif - enddo - - ! Finish the last outstanding requests -#ifdef WITH_MPI -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,mpi_status,mpierr) - call mpi_wait(ireq_hv,mpi_status,mpierr) - - allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) - call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) - -#endif - call mpi_barrier(mpi_comm,mpierr) -#endif - deallocate(ab) - deallocate(ireq_hhr, ireq_hhs) - deallocate(hh_cnt, hh_dst) - deallocate(hh_gath, hh_send) - deallocate(limits, snd_limits) - deallocate(block_limits) - deallocate(global_id) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_band_complex") -#endif - - end subroutine tridiag_band_complex - - subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, & - hh_trans_complex, mpi_comm_rows, mpi_comm_cols, & - wantDebug, success, THIS_COMPLEX_ELPA_KERNEL) - - !------------------------------------------------------------------------------- - ! trans_ev_tridi_to_band_complex: - ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nev Number eigenvectors to compute (= columns of matrix q) - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nb semi bandwith - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! matrixCols local columns of matrix q - ! 
- ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns/both - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use pack_unpack_complex - use compute_hh_trafo_complex - use precision - implicit none - - integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: q(ldq,*) -#else - complex(kind=ck) :: q(ldq,matrixCols) -#endif - complex(kind=ck) :: hh_trans_complex(:,:) - integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol - - integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 - integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end - integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end - integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length - integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count -#ifdef WITH_OPENMP - integer(kind=ik) :: thread_width, csw, b_off, b_len -#endif - integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd - integer(kind=ik) :: a_off, current_tv_off, max_blk_size - integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk - logical :: flag - -#ifdef WITH_OPENMP - complex(kind=ck), pointer :: a(:,:,:,:) -#else - complex(kind=ck), pointer :: a(:,:,:) -#endif - complex(kind=ck) :: a_complex - complex(kind=ck), allocatable :: row(:) - type(c_ptr) :: a_ptr - -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) - complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) -#else - complex(kind=ck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) - complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) -#endif 
- complex(kind=ck), allocatable :: result_buffer(:,:,:) - complex(kind=ck), allocatable :: bcast_buffer(:,:) - - integer(kind=ik) :: n_off - integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) - integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) - integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif -#endif - - ! MPI send/recv tags, arbitrary - - integer(kind=ik), parameter :: bottom_recv_tag = 111 - integer(kind=ik), parameter :: top_recv_tag = 222 - integer(kind=ik), parameter :: result_recv_tag = 333 - -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads -#endif - - ! Just for measuring the kernel performance - real(kind=rk) :: kernel_time - ! long integer - integer(kind=lik) :: kernel_flops - - logical, intent(in) :: wantDebug - logical :: success -#ifndef WITH_MPI - integer(kind=ik) :: j1 -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_tridi_to_band_complex") -#endif - - kernel_time = 1.d-100 - kernel_flops = 0 - -#ifdef WITH_OPENMP - max_threads = 1 - max_threads = omp_get_max_threads() -#endif - call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) - call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) - call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) - call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) - success = .true. - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk' - endif - - success = .false. - return - endif - endif - - nfact = nbw / nblk - - - ! 
local number of eigenvectors - l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) - - if (l_nev==0) then -#ifdef WITH_OPENMP - thread_width = 0 -#endif - stripe_width = 0 - stripe_count = 0 - last_stripe_width = 0 - else - ! Suggested stripe width is 48 - should this be reduced for the complex case ??? -#ifdef WITH_OPENMP - thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread -#endif - - stripe_width = 48 ! Must be a multiple of 4 -#ifdef WITH_OPENMP - stripe_count = (thread_width-1)/stripe_width + 1 -#else - stripe_count = (l_nev-1)/stripe_width + 1 -#endif - ! Adapt stripe width so that last one doesn't get too small -#ifdef WITH_OPENMP - stripe_width = (thread_width-1)/stripe_count + 1 -#else - stripe_width = (l_nev-1)/stripe_count + 1 -#endif - stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! -#ifndef WITH_OPENMP - last_stripe_width = l_nev - (stripe_count-1)*stripe_width -#endif - endif - - ! Determine the matrix distribution at the beginning - - allocate(limits(0:np_rows)) - - call determine_workload(na, nbw, np_rows, limits) - - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - a_dim2 = max_blk_size + nbw - -#ifdef WITH_OPENMP - if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then -#else - if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then -#endif - write(error_unit,*) "Cannot allocate memory" - success = .false. - return - endif - - call c_f_pointer(a_ptr, a, & -#ifdef WITH_OPENMP - [stripe_width,a_dim2,stripe_count,max_threads] & -#else - [stripe_width,a_dim2,stripe_count] & -#endif - ) - -#ifndef WITH_OPENMP - a(:,:,:) = 0 -#endif - - allocate(row(l_nev)) - row(:) = 0 - - ! Copy q from a block cyclic distribution into a distribution with contiguous rows, - ! and transpose the matrix using stripes of given stripe_width for cache blocking. - - ! 
The peculiar way it is done below is due to the fact that the last row should be - ! ready first since it is the first one to start below - -#ifdef WITH_OPENMP - ! Please note about the OMP usage below: - ! This is not for speed, but because we want the matrix a in the memory and - ! in the cache of the correct thread (if possible) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - - do ip = np_rows-1, 0, -1 - if (my_prow == ip) then - ! Receive my rows which have not yet been received - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src < my_prow) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) - -#else - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - row(1:l_nev) = row(1:l_nev) -#else - row(1:l_nev) = row(1:l_nev) -#endif - -#endif /* WITH_MPI */ - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - elseif (src==my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_OPENMP -#ifdef 
HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(ip),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - ! Send all rows which have not yet been send - src_offset = 0 - do dst = 0, ip-1 - do i=limits(dst)+1,limits(dst+1) - if(mod((i-1)/nblk, np_rows) == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr) -#endif - endif - enddo - enddo - else if(my_prow < ip) then - ! Send all rows going to PE ip - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr) -#endif - endif - enddo - ! 
Receive all rows from PE ip - do i=limits(my_prow)+1,limits(my_prow+1) - src = mod((i-1)/nblk, np_rows) - if (src == ip) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP - row(1:l_nev) = row(1:l_nev) -#else - row(1:l_nev) = row(1:l_nev) -#endif -#endif /* WITH_MPI */ - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_complex_cpu_openmp(a, row,i-limits(my_prow),my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else - call unpack_row_complex_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) -#endif - endif - enddo - endif - enddo - - - ! Set up result buffer queue - - num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows - - num_result_buffers = 4*nfact - allocate(result_buffer(l_nev,nblk,num_result_buffers)) - - allocate(result_send_request(num_result_buffers)) - allocate(result_recv_request(num_result_buffers)) -#ifdef WITH_MPI - result_send_request(:) = MPI_REQUEST_NULL - result_recv_request(:) = MPI_REQUEST_NULL -#endif - ! Queue up buffers -#ifdef WITH_MPI - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j = 1, min(num_result_buffers, num_result_blocks) - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) - enddo - endif -#else - ! carefull the "recieve" has to be done at the corresponding wait or send - !if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - ! 
do j = 1, min(num_result_buffers, num_result_blocks) - ! result_buffer(1:l_nev*nblk,1,j) = result_buffer(1:l_nev*nblk,1,nbuf) - ! enddo - !endif - -#endif - num_bufs_recvd = 0 ! No buffers received yet - - ! Initialize top/bottom requests - - allocate(top_send_request(stripe_count)) - allocate(top_recv_request(stripe_count)) - allocate(bottom_send_request(stripe_count)) - allocate(bottom_recv_request(stripe_count)) -#ifdef WITH_MPI - top_send_request(:) = MPI_REQUEST_NULL - top_recv_request(:) = MPI_REQUEST_NULL - bottom_send_request(:) = MPI_REQUEST_NULL - bottom_recv_request(:) = MPI_REQUEST_NULL -#endif - -#ifdef WITH_OPENMP - allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) - - top_border_send_buffer(:,:) = 0 - top_border_recv_buffer(:,:) = 0 - bottom_border_send_buffer(:,:) = 0 - bottom_border_recv_buffer(:,:) = 0 -#else - allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) - allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) - - top_border_send_buffer(:,:,:) = 0 - top_border_recv_buffer(:,:,:) = 0 - bottom_border_send_buffer(:,:,:) = 0 - bottom_border_recv_buffer(:,:,:) = 0 -#endif - - ! Initialize broadcast buffer - - allocate(bcast_buffer(nbw, max_blk_size)) - bcast_buffer = 0 - - current_tv_off = 0 ! Offset of next row to be broadcast - - - ! ------------------- start of work loop ------------------- - - a_off = 0 ! 
offset in A (to avoid unnecessary shifts) - - top_msg_length = 0 - bottom_msg_length = 0 - - do sweep = 0, (na-1)/nbw - - current_n = na - sweep*nbw - call determine_workload(current_n, nbw, np_rows, limits) - current_n_start = limits(my_prow) - current_n_end = limits(my_prow+1) - current_local_n = current_n_end - current_n_start - - next_n = max(current_n - nbw, 0) - call determine_workload(next_n, nbw, np_rows, limits) - next_n_start = limits(my_prow) - next_n_end = limits(my_prow+1) - next_local_n = next_n_end - next_n_start - - if (next_n_end < next_n) then - bottom_msg_length = current_n_end - next_n_end - else - bottom_msg_length = 0 - endif - - if (next_local_n > 0) then - next_top_msg_length = current_n_start - next_n_start - else - next_top_msg_length = 0 - endif - - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - do i = 1, stripe_count -#ifdef WITH_OPENMP - csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" - b_len = csw*nbw*max_threads -#ifdef WITH_MPI - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be do done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be do done at the corresponding wait or send -! 
bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) -#endif - -#endif /* WITH_OPENMP */ - enddo - endif - - if (current_local_n > 1) then - if (my_pcol == mod(sweep,np_cols)) then - bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n) - current_tv_off = current_tv_off + current_local_n - endif -#ifdef WITH_MPI - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr) -#endif - else - ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex - bcast_buffer(:,1) = 0 - endif - - if (l_nev == 0) cycle - - if (current_local_n > 0) then - - do i = 1, stripe_count - -#ifdef WITH_OPENMP - ! Get real stripe width for strip i; - ! The last OpenMP tasks may have an even smaller stripe with, - ! but we don't care about this, i.e. we send/recv a bit too much in this case. - ! csw: current_stripe_width - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) -#endif - - !wait_b - if (current_n_end < current_n) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - n_off = current_local_n+a_off - b_len = csw*nbw - b_off = (my_thread-1)*b_len - a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & - reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - n_off = current_local_n+a_off - a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) -#endif /* WITH_OPENMP */ - if (next_n_end < 
next_n) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & - - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - -#ifdef WITH_OPENMP -! carefull the "recieve" has to be do done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) -#else -! carefull the "recieve" has to be do done at the corresponding wait or send -! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) -#endif - -#endif /* WITH_MPI */ - endif - endif - - if (current_local_n <= bottom_msg_length + top_msg_length) then - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - -#ifndef WITH_OPENMP - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, current_local_n, i, my_thread, thread_width, & - 
THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, current_local_n, i, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif /* WITH_OPENMP */ - !send_b -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off -#ifdef WITH_OPENMP - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif -#endif - -#else /* WITH_OPENMP */ - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & - bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) - endif -#endif - -#endif /* WITH_OPENMP */ - endif - - else - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 
1, max_threads - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & - thread_width, THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, & - last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) - - -#endif /* WITH_OPENMP */ - !send_b -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#else - - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off -#ifdef WITH_OPENMP - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif -#endif - -#else /* WITH_OPENMP */ - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) -#ifdef WITH_MPI - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & - 
bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) - endif -#endif - -#endif /* WITH_OPENMP */ - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, & - kernel_time, top_msg_length, & - current_local_n-top_msg_length-bottom_msg_length, i, & - my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) - -#endif /* WITH_OPENMP */ - !wait_t - if (top_msg_length>0) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - -#ifndef WITH_OPENMP - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) - -#endif - endif - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, 
max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, thread_width, & - THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif /* WITH_OPENMP */ - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - b_len = csw*next_top_msg_length*max_threads -#ifdef WITH_MPI - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be done at the corresponding send or wait -! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else -! carefull the "recieve" has to be done at the corresponding send or wait -! top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & -! 
bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) -#endif - -#endif /* WITH_OPENMP */ - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - -#ifdef WITH_OPENMP - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) -#ifdef WITH_MPI - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif -#endif - -#else /* WITH_OPENMP */ - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) -#ifdef WITH_MPI - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) - endif -#endif - -#endif /* WITH_OPENMP */ - endif - - ! 
Care that there are not too many outstanding top_recv_request's - if (stripe_count > 1) then - if (i>1) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - else -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - endif - - enddo - - top_msg_length = next_top_msg_length - - else - ! wait for last top_send_request -#ifdef WITH_MPI - do i = 1, stripe_count -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - enddo -#endif /* WITH_MPI */ - endif - - ! Care about the result - - if (my_prow == 0) then - - ! topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) -#else - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - dst = mod(num_blk, np_rows) - - if (dst == 0) then - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, row, j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) - -#endif - enddo -#ifdef WITH_MPI - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#else - if (j+num_result_buffers < num_result_blocks) & - result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j1 = 1, min(num_result_buffers, num_result_blocks) - result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) - enddo - endif -#endif - endif - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! 
outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) - -#else - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - flag = .true. -#endif /* WITH_MPI */ - if (.not.flag) exit - else -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) - -#else - - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! Queue result buffer again if there are outstanding blocks left -#ifdef WITH_MPI - if (j+num_result_buffers < num_result_blocks) & - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#else -! carefull "recieve" has to be done at corresponding wait or send -! if (j+num_result_buffers < num_result_blocks) & -! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) -#endif - enddo - num_bufs_recvd = j - - endif - - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - - if (offset<0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset - endif - success = .false. 
- return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo -#else /* WITH_OPENMP */ - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) -#endif /* WITH_OPENMP */ - enddo - enddo -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - - a_off = 0 - endif - enddo - - ! Just for safety: -#ifdef WITH_MPI - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol -#endif - if (my_prow == 0) then -#ifdef WITH_MPI -#ifdef WITH_OPENMP - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses) -#else - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - endif -#ifdef WITH_MPI - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol -#endif - if (my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! deallocate all working space - - nullify(a) - call free(a_ptr) - deallocate(row) - deallocate(limits) - deallocate(result_send_request) - deallocate(result_recv_request) - deallocate(top_border_send_buffer) - deallocate(top_border_recv_buffer) - deallocate(bottom_border_send_buffer) - deallocate(bottom_border_recv_buffer) - deallocate(result_buffer) - deallocate(bcast_buffer) - deallocate(top_send_request) - deallocate(top_recv_request) - deallocate(bottom_send_request) - deallocate(bottom_recv_request) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_complex") -#endif - return -! contains -! -!#ifdef WITH_OPENMP -! subroutine compute_hh_trafo_complex(off, ncols, istripe, my_thread, THIS_COMPLEX_ELPA_KERNEL) -!#else -! subroutine compute_hh_trafo_complex(off, ncols, istripe, THIS_COMPLEX_ELPA_KERNEL) -!#endif -! use precision -!#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) -! use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple -!#endif -!#if defined(WITH_COMPLEX_GENERIC_KERNEL) -! use complex_generic_kernel, only : single_hh_trafo_complex_generic -!#endif -!#ifdef HAVE_DETAILED_TIMINGS -! use timings -!#endif -! implicit none -! integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL -! -! ! Private variables in OMP regions (my_thread) should better be in the argument list! -! -! integer(kind=ik) :: off, ncols, istripe, j, nl, jj -!#ifdef WITH_OPENMP -! integer(kind=ik) :: my_thread, noff -!#endif -! real(kind=rk) :: ttt -! -! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -! ! Currently (on Sandy Bridge), single is faster than double -! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -! -! complex(kind=ck) :: w(nbw,2) -! -!#ifdef HAVE_DETAILED_TIMINGS -! call timer%start("compute_hh_trafo_complex") -!#endif -! -!#ifdef WITH_OPENMP -! 
if (istripe1) then - do i=0,nblocks2-1 - call mpi_irecv(ab2(1,i*nb2+1),2*nb2*nb2,mpi_real8,0,3,mpi_comm,ireq_ab2(i+1),mpierr) - enddo - endif -#else - ! carefull the "recieve" has to be done at the corresponding send or wait -! if (nb2>1) then -! do i=0,nblocks2-1 -! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2) -! enddo -! endif - -#endif - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - lwork = nb*nb2 - dest = 0 -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first nb2 columns to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) - enddo -#ifdef WITH_MPI - call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - - do istep=1,na/nb2 - - if (my_pe==0) then - - n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced - hv(:,:) = 0 - tau(:) = 0 - - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na/nb2) then - - ! 
Transform first block column of remaining matrix - call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info); - - do i=1,nb2 - hv(i,i) = 1.0 - hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1) - ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0 - enddo - - endif - - if (nb2==1) then - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na) then - e(na) = 0 - endif - else - ab_s2 = 0 - ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1) - if (block_limits2(dest+1)1) then - do i= 0,nblocks2-1 - ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2) - enddo - endif -#endif - endif - - else - if (na>na_s+nb2-1) then - ! Receive Householder vectors from previous task, from PE owning subdiagonal -#ifdef WITH_MPI - call mpi_recv(hv,nb*nb2,mpi_real8,my_pe-1,2,mpi_comm,mpi_status,mpierr) -#else - hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2) -#endif - do i=1,nb2 - tau(i) = hv(i,i) - hv(i,i) = 1. - enddo - endif - endif - - na_s = na_s+nb2 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif - - do iblk=1,nblocks - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-nb2 ! last column in block - - if (ns+n_off>na) exit - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - call wy_gen(nc,nb2,w,hv,tau,work,nb) - - if (iblk==nblocks .and. nc==nb) then - !request last nb2 columns -#ifdef WITH_MPI - call mpi_recv(ab_r,(nb+1)*nb2,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) -#else - ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2) -#endif - do i=1,nb2 - ab(1:nb+1,ne+i-1) = ab_r(:,i) - enddo - endif - - hv_new(:,:) = 0 ! 
Needed, last rows must be 0 for nr < nb - tau_new(:) = 0 - - if (nr>0) then - call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb) - - call dgeqrf(nr,nb2,ab(nb+1,ns),2*nb-1,tau_new,work,lwork,info); - - do i=1,nb2 - hv_new(i,i) = 1.0 - hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1) - ab(nb+2:,ns+i-1) = 0 - enddo - - !send hh-vector - if (iblk==nblocks) then -#ifdef WITH_MPI - call mpi_wait(ireq_hv,mpi_status,mpierr) -#endif - hv_s = hv_new - do i=1,nb2 - hv_s(i,i) = tau_new(i) - enddo -#ifdef WITH_MPI - call mpi_isend(hv_s,nb*nb2,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) -#endif - endif - - endif - - call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb) - - if (my_pe>0 .and. iblk==1) then - !send first nb2 columns to previous PE -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) -#endif - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) - enddo -#ifdef WITH_MPI - call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) -#endif - endif - - if (nr>0) then - call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb) - call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) - endif - - ! Use new HH vector for the next block - hv(:,:) = hv_new(:,:) - tau = tau_new - enddo - enddo - - ! 
Finish the last outstanding requests -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) - call mpi_wait(ireq_hv,mpi_status,mpierr) - allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2)) - call mpi_waitall(nblocks2,ireq_ab2,mpi_statuses,mpierr) - deallocate(mpi_statuses) - - call mpi_barrier(mpi_comm,mpierr) -#endif - deallocate(block_limits) - deallocate(block_limits2) - deallocate(ireq_ab2) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("band_band_real") -#endif - - end subroutine - - subroutine wy_gen(n, nb, W, Y, tau, mem, lda) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !length of householder-vectors - integer(kind=ik), intent(in) :: nb !number of householder-vectors - integer(kind=ik), intent(in) :: lda !leading dimension of Y and W - real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b - real(kind=rk), intent(in) :: tau(nb) !tau values - real(kind=rk), intent(out) :: W(lda,nb) !output matrix W - real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_gen") -#endif - - W(1:n,1) = tau(1)*Y(1:n,1) - do i=2,nb - W(1:n,i) = tau(i)*Y(1:n,i) - call DGEMV('T',n,i-1,1.d0,Y,lda,W(1,i),1,0.d0,mem,1) - call DGEMV('N',n,i-1,-1.d0,W,lda,mem,1,1.d0,W(1,i),1) - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_gen") -#endif - - end subroutine - - subroutine wy_left(n, m, nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !width of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to 
be transformed ! remove assumed size - real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y - real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_left") -#endif - - call DGEMM('T', 'N', nb, n, m, 1.d0, W, lda2, A, lda, 0.d0, mem, nb) - call DGEMM('N', 'N', m, n, nb, -1.d0, Y, lda2, mem, nb, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_left") -#endif - - end subroutine - - subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !height of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size - real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y - real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_right") -#endif - - call DGEMM('N', 'N', n, nb, m, 1.d0, A, lda, W, lda2, 0.d0, mem, n) - call DGEMM('N', 'T', n, m, nb, -1.d0, mem, n, Y, lda2, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_right") -#endif - - end subroutine - - subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size - real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y - real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb - real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_symm") -#endif - - call DSYMM('L', 'L', n, nb, 1.d0, A, lda, W, lda2, 0.d0, mem, n) - call DGEMM('T', 'N', nb, nb, n, 1.d0, mem, n, W, lda2, 0.d0, mem2, nb) - call DGEMM('N', 'N', n, nb, nb, -0.5d0, Y, lda2, mem2, nb, 1.d0, mem, n) - call DSYR2K('L', 'N', n, nb, -1.d0, Y, lda2, mem, n, 1.d0, A, lda) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_symm") -#endif - end subroutine -end module ELPA2_compute diff -Nru elpa-2016.05.001/src/elpa2.F90 elpa-2019.11.001/src/elpa2.F90 --- elpa-2016.05.001/src/elpa2.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,555 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), fomerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! This particular source code file contains additions, changes and -! enhancements authored by Intel Corporation which is not part of -! the ELPA consortium. -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! 
ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" -!> \brief Fortran module which provides the routines to use the two-stage ELPA solver -module ELPA2 - -! 
Version 1.1.2, 2011-02-21 - - use elpa_utilities - use elpa1_compute - use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve - use elpa2_utilities - use elpa2_compute - use elpa_pdgeqrf - - use elpa_mpi - implicit none - - PRIVATE ! By default, all routines contained are private - - ! The following routines are public: - - public :: solve_evp_real_2stage - public :: solve_evp_complex_2stage - - -!****** -contains -!------------------------------------------------------------------------------- -!> \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a 2 stage approach -!> -!> Parameters -!> -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
-!> -!> \param matrixCols local columns of matrix a and q -!> -!> \param mpi_comm_rows MPI communicator for rows -!> \param mpi_comm_cols MPI communicator for columns -!> \param mpi_comm_all MPI communicator for the total processor set -!> -!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API -!> -!> \param use_qr (optional) use QR decomposition -!> -!> \result success logical, false if error occured -!------------------------------------------------------------------------------- - -function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, & - mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& - useQR) result(success) - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - logical, intent(in), optional :: useQR - logical :: useQRActual, useQREnvironment - integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API - integer(kind=ik) :: THIS_REAL_ELPA_KERNEL - - integer(kind=ik), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, & - mpi_comm_cols, mpi_comm_all - integer(kind=ik), intent(in) :: nblk - real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) - ! was - ! real a(lda,*), q(ldq,*) - real(kind=rk), allocatable :: hh_trans_real(:,:) - - integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: nbw, num_blocks - real(kind=rk), allocatable :: tmat(:,:,:), e(:) - real(kind=rk) :: ttt0, ttt1, ttts - integer(kind=ik) :: i - logical :: success - logical, save :: firstCall = .true. 
- logical :: wantDebug - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_2stage") -#endif - call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) - call mpi_comm_size(mpi_comm_all,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - wantDebug = .false. - if (firstCall) then - ! are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - success = .true. - - useQRActual = .false. - - ! set usage of qr decomposition via API call - if (present(useQR)) then - if (useQR) useQRActual = .true. - if (.not.(useQR)) useQRACtual = .false. - endif - - ! overwrite this with environment variable settings - if (qr_decomposition_via_environment_variable(useQREnvironment)) then - useQRActual = useQREnvironment - endif - - if (useQRActual) then - if (mod(na,2) .ne. 0) then - if (wantDebug) then - write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize" - endif - print *, "Do not use QR-decomposition for this matrix and blocksize." - success = .false. - return - endif - endif - - - if (present(THIS_REAL_ELPA_KERNEL_API)) then - ! user defined kernel via the optional argument in the API call - THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API - else - - ! if kernel is not choosen via api - ! check whether set by environment variable - THIS_REAL_ELPA_KERNEL = get_actual_real_kernel() - endif - - ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this - if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then - - if (my_pe == 0) then - write(error_unit,*) " " - write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL) - write(error_unit,*) "is not in the list of the allowed kernels!" 
- write(error_unit,*) " " - write(error_unit,*) "Allowed kernels are:" - do i=1,size(REAL_ELPA_KERNEL_NAMES(:)) - if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then - write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i) - endif - enddo - - write(error_unit,*) " " - ! check whether generic kernel is defined - if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then - write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !" - else - write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used" - endif - endif ! my_pe == 0 - if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then - THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC - else - THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL - endif - endif - - ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 - ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal. - ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32! - ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye - ! on this and maybe allow a run-time optimization here - nbw = (63/nblk+1)*nblk - - num_blocks = (na-1)/nbw + 1 - - allocate(tmat(nbw,nbw,num_blocks)) - - ! Reduction full -> band - - ttt0 = MPI_Wtime() - ttts = ttt0 - call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success, useQRActual) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time bandred_real :',ttt1-ttt0 - - ! Reduction band -> tridiagonal - - allocate(e(na)) - - ttt0 = MPI_Wtime() - call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_all) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) & - write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0 -#ifdef WITH_MPI - call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr) - call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr) -#endif - ttt1 = MPI_Wtime() - time_evp_fwd = ttt1-ttts - - ! Solve tridiagonal system - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - ttts = ttt1 - - deallocate(e) - - ! Backtransform stage 1 - - ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success, & - THIS_REAL_ELPA_KERNEL) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0 - - ! We can now deallocate the stored householder vectors - deallocate(hh_trans_real) - - ! Backtransform stage 2 - - ttt0 = MPI_Wtime() - call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & - mpi_comm_cols, useQRActual) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) & - write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0 - time_evp_back = ttt1-ttts - - deallocate(tmat) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_2stage") -#endif -1 format(a,f10.3) - -end function solve_evp_real_2stage - - -!------------------------------------------------------------------------------- -!> \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a 2 stage approach -!> -!> Parameters -!> -!> \param na Order of matrix a -!> -!> \param nev Number of eigenvalues needed -!> -!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. -!> Distribution is like in Scalapack. -!> The full matrix must be set (not only one half like in scalapack). -!> Destroyed on exit (upper and lower half). -!> -!> \param lda Leading dimension of a -!> -!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set -!> -!> \param q(ldq,matrixCols) On output: Eigenvectors of a -!> Distribution is like in Scalapack. -!> Must be always dimensioned to the full size (corresponding to (na,na)) -!> even if only a part of the eigenvalues is needed. -!> -!> \param ldq Leading dimension of q -!> -!> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
-!> -!> \param matrixCols local columns of matrix a and q -!> -!> \param mpi_comm_rows MPI communicator for rows -!> \param mpi_comm_cols MPI communicator for columns -!> \param mpi_comm_all MPI communicator for the total processor set -!> -!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API -!> -!> \result success logical, false if error occured -!------------------------------------------------------------------------------- -function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success) - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API - integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all - complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols) - ! was - ! complex a(lda,*), q(ldq,*) - real(kind=rk), intent(inout) :: ev(na) - complex(kind=ck), allocatable :: hh_trans_complex(:,:) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes - integer(kind=ik) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks - complex(kind=ck), allocatable :: tmat(:,:,:) - real(kind=rk), allocatable :: q_real(:,:), e(:) - real(kind=rk) :: ttt0, ttt1, ttts - integer(kind=ik) :: i - - logical :: success, wantDebug - logical, save :: firstCall = .true. - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_complex_2stage") -#endif - call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) - call mpi_comm_size(mpi_comm_all,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - wantDebug = .false. - if (firstCall) then - ! 
are debug messages desired? - wantDebug = debug_messages_via_environment_variable() - firstCall = .false. - endif - - - success = .true. - - if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then - ! user defined kernel via the optional argument in the API call - THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API - else - ! if kernel is not choosen via api - ! check whether set by environment variable - THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel() - endif - - ! check whether choosen kernel is allowed - if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then - - if (my_pe == 0) then - write(error_unit,*) " " - write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL) - write(error_unit,*) "is not in the list of the allowed kernels!" - write(error_unit,*) " " - write(error_unit,*) "Allowed kernels are:" - do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:)) - if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then - write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i) - endif - enddo - - write(error_unit,*) " " - ! check whether generic kernel is defined - if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then - write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !" - else - write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used" - endif - endif ! my_pe == 0 - if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then - THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC - else - THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL - endif - endif - ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32 - - nbw = (31/nblk+1)*nblk - - num_blocks = (na-1)/nbw + 1 - - allocate(tmat(nbw,nbw,num_blocks)) - - ! 
Reduction full -> band - - ttt0 = MPI_Wtime() - ttts = ttt0 - call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop() -#endif - return - endif - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0 - - ! Reduction band -> tridiagonal - - allocate(e(na)) - - ttt0 = MPI_Wtime() - call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_all) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0 -#ifdef WITH_MPI - call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr) - call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr) -#endif - ttt1 = MPI_Wtime() - time_evp_fwd = ttt1-ttts - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev - - allocate(q_real(l_rows,l_cols)) - - ! Solve tridiagonal system - - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success) - if (.not.(success)) return - - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - ttts = ttt1 - - q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) - - deallocate(e, q_real) - - ! 
Backtransform stage 1 - - ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, & - matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, & - wantDebug, success,THIS_COMPLEX_ELPA_KERNEL) - if (.not.(success)) return - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0 - - ! We can now deallocate the stored householder vectors - deallocate(hh_trans_complex) - - ! Backtransform stage 2 - - ttt0 = MPI_Wtime() - call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, & - mpi_comm_rows, mpi_comm_cols) - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0 - time_evp_back = ttt1-ttts - - deallocate(tmat) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_2stage") -#endif - -1 format(a,f10.3) - -end function solve_evp_complex_2stage - -end module ELPA2 diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s 1970-01-01 00:00:00.000000000 +0000 @@ -1,765 +0,0 @@ -# This file is part of ELPA. -# -# The ELPA library was originally created by the ELPA consortium, -# consisting of the following organizations: -# -# - Max Planck Computing and Data Facility (MPCDF), formerly known as -# Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -# - Bergische Universität Wuppertal, Lehrstuhl für angewandte -# Informatik, -# - Technische Universität München, Lehrstuhl für Informatik mit -# Schwerpunkt Wissenschaftliches Rechnen , -# - Fritz-Haber-Institut, Berlin, Abt. 
Theorie, -# - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -# Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -# and -# - IBM Deutschland GmbH -# -# -# More information can be found here: -# http://elpa.mpcdf.mpg.de/ -# -# ELPA is free software: you can redistribute it and/or modify -# it under the terms of the version 3 of the license of the -# GNU Lesser General Public License as published by the Free -# Software Foundation. -# -# ELPA is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with ELPA. If not, see -# -# ELPA reflects a substantial effort on the part of the original -# ELPA consortium, and we ask you to respect the spirit of the -# license that we chose: i.e., please contribute any changes you -# may have back to the original ELPA library distribution, and keep -# any derivatives of ELPA under the same license that we chose for -# the original distribution, the GNU Lesser General Public License. -# - - -# -------------------------------------------------------------------------------------------------- -# -# This file contains the compute intensive kernels for the Householder transformations, -# coded in x86_64 assembler and using SSE2/SSE3 instructions. -# -# It must be assembled with GNU assembler (just "as" on most Linux machines) -# -# Copyright of the original code rests with the authors inside the ELPA -# consortium. The copyright of any additional modifications shall rest -# with their original authors, but shall adhere to the licensing terms -# distributed along with the original code in the file "COPYING". 
-# -# -------------------------------------------------------------------------------------------------- - .globl double_hh_trafo - .globl single_hh_trafo_complex - .text - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - - .macro hh_trafo_real nrows - - # When this macro is called, the following registers are set and must not be changed - # %rdi: Address of q - # %rsi: Address of hh - # %rdx: nb - # %rcx: Remaining rows nq - # %r8: ldq in bytes - # %r9: ldh in bytes - # %rax: address of hh at the end of the loops - # The top of the stack must contain the dot product of the two Householder vectors - - movq %rdi, %r10 # Copy address of q - movq %rsi, %r11 # Copy address of hh - - -# x1 = q(1,2) -# x2 = q(2,2) -# -# y1 = q(1,1) + q(1,2)*hh(2,2) -# y2 = q(2,1) + q(2,2)*hh(2,2) - - movaps (%r10), %xmm6 # y1 = q(1,1) - movaps 16(%r10), %xmm7 # y2 = q(2,1) - .if \nrows>=8 - movaps 32(%r10), %xmm8 - movaps 48(%r10), %xmm9 - .if \nrows==12 - movaps 64(%r10), %xmm10 - movaps 80(%r10), %xmm11 - .endif - .endif - - addq %r8, %r10 # %r10 => q(.,2) - movddup 8(%r11,%r9), %xmm15 # hh(2,2) - - .macro mac_pre_loop1 qoff, X, Y - movaps \qoff(%r10), \X # xn = q(n,2) - movaps \X, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, \Y # yn = yn + xn*h(2,2) - .endm - - mac_pre_loop1 0, %xmm0, %xmm6 - mac_pre_loop1 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_pre_loop1 32, %xmm2, %xmm8 - mac_pre_loop1 48, %xmm3, %xmm9 - .if \nrows==12 - mac_pre_loop1 64, %xmm4, %xmm10 - mac_pre_loop1 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_pre_loop1 - -# do i=3,nb -# h1 = hh(i-1,1) -# h2 = hh(i,2) -# x1 = x1 + q(1,i)*h1 -# y1 = y1 + q(1,i)*h2 -# x2 = x2 + q(2,i)*h1 -# y2 = y2 + q(2,i)*h2 -# ... 
-# enddo - - addq $8, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # hh(i-1,1) - movddup 8(%r11,%r9), %xmm15 # hh(i,2) - - .macro mac_loop1 qoff, X, Y - movaps \qoff(%r10), %xmm13 # q(.,i) - movaps %xmm13, %xmm12 - mulpd %xmm14, %xmm13 - addpd %xmm13, \X # xn = xn + q(.,i)*h1 - mulpd %xmm15, %xmm12 - addpd %xmm12, \Y # yn = yn + q(.,i)*h2 - .endm - - mac_loop1 0, %xmm0, %xmm6 - mac_loop1 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_loop1 32, %xmm2, %xmm8 - mac_loop1 48, %xmm3, %xmm9 - .if \nrows==12 - mac_loop1 64, %xmm4, %xmm10 - mac_loop1 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop1 - - addq $8, %r11 - jmp 1b -2: - -# x1 = x1 + q(1,nb+1)*hh(nb,1) -# x2 = x2 + q(2,nb+1)*hh(nb,1) - - addq %r8, %r10 # %r10 => q(.,nb+1) - movddup (%r11), %xmm14 - - .macro mac_post_loop1 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,nb+1) - mulpd %xmm14, %xmm13 - addpd %xmm13, \X - .endm - - mac_post_loop1 0, %xmm0 - mac_post_loop1 16, %xmm1 - .if \nrows>=8 - mac_post_loop1 32, %xmm2 - mac_post_loop1 48, %xmm3 - .if \nrows==12 - mac_post_loop1 64, %xmm4 - mac_post_loop1 80, %xmm5 - .endif - .endif - .purgem mac_post_loop1 - -# tau1 = hh(1,1) -# tau2 = hh(1,2) -# -# h1 = -tau1 -# x1 = x1*h1 -# x2 = x2*h1 - - movq %rsi, %r11 # restore %r11 (hh(1,1)) - - movddup (%r11), %xmm12 # hh(1,1) - xorps %xmm14, %xmm14 - subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1) - - mulpd %xmm14, %xmm0 - mulpd %xmm14, %xmm1 - .if \nrows>=8 - mulpd %xmm14, %xmm2 - mulpd %xmm14, %xmm3 - .if \nrows==12 - mulpd %xmm14, %xmm4 - mulpd %xmm14, %xmm5 - .endif - .endif - -# h1 = -tau2 -# h2 = -tau2*s -# y1 = y1*h1 + x1*h2 -# y2 = y2*h1 + x2*h2 - - movddup (%r11,%r9), %xmm12 # hh(1,2) - xorps %xmm15, %xmm15 - subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1 - movaps %xmm15, %xmm14 - movddup (%rsp), %xmm12 # Get s from top of stack - mulpd %xmm12, %xmm14 # %xmm14 = h2 - - .macro mac_xform_y X, Y - mulpd %xmm15, \Y # y1 = 
y1*h1 - movaps \X, %xmm12 - mulpd %xmm14, %xmm12 - addpd %xmm12, \Y - .endm - - mac_xform_y %xmm0, %xmm6 - mac_xform_y %xmm1, %xmm7 - .if \nrows>=8 - mac_xform_y %xmm2, %xmm8 - mac_xform_y %xmm3, %xmm9 - .if \nrows==12 - mac_xform_y %xmm4, %xmm10 - mac_xform_y %xmm5, %xmm11 - .endif - .endif - .purgem mac_xform_y - -# q(1,1) = q(1,1) + y1 -# q(2,1) = q(2,1) + y2 - - movq %rdi, %r10 # restore original Q - - .macro mac_pre_loop2_1 qoff, Y - movaps \qoff(%r10), %xmm13 # q(.,1) - addpd \Y, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2_1 0, %xmm6 - mac_pre_loop2_1 16, %xmm7 - .if \nrows>=8 - mac_pre_loop2_1 32, %xmm8 - mac_pre_loop2_1 48, %xmm9 - .if \nrows==12 - mac_pre_loop2_1 64, %xmm10 - mac_pre_loop2_1 80, %xmm11 - .endif - .endif - .purgem mac_pre_loop2_1 - -# q(1,2) = q(1,2) + x1 + y1*hh(2,2) -# q(2,2) = q(2,2) + x2 + y2*hh(2,2) - - addq %r8, %r10 # %r10 => q(.,2) - - movddup 8(%r11,%r9), %xmm15 # hh(2,2) - - .macro mac_pre_loop2_2 qoff, X, Y - movaps \X, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2_2 0, %xmm0, %xmm6 - mac_pre_loop2_2 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_pre_loop2_2 32, %xmm2, %xmm8 - mac_pre_loop2_2 48, %xmm3, %xmm9 - .if \nrows==12 - mac_pre_loop2_2 64, %xmm4, %xmm10 - mac_pre_loop2_2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_pre_loop2_2 - -# do i=3,nb -# h1 = hh(i-1,1) -# h2 = hh(i,2) -# q(1,i) = q(1,i) + x1*h1 + y1*h2 -# q(2,i) = q(2,i) + x2*h1 + y2*h2 -# enddo - - addq $8, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # hh(i-1,1) - movddup 8(%r11,%r9), %xmm15 # hh(i,2) - - .macro mac_loop2 qoff, X, Y - movaps \X, %xmm13 - mulpd %xmm14, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_loop2 0, %xmm0, %xmm6 - 
mac_loop2 16, %xmm1, %xmm7 - .if \nrows>=8 - mac_loop2 32, %xmm2, %xmm8 - mac_loop2 48, %xmm3, %xmm9 - .if \nrows==12 - mac_loop2 64, %xmm4, %xmm10 - mac_loop2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop2 - - addq $8, %r11 - jmp 1b -2: - -# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) -# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - - addq %r8, %r10 # %r10 => q(.,nb+1) - movddup (%r11), %xmm14 - - .macro mac_post_loop2 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,nb+1) - mulpd %xmm14, \X - addpd \X, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_post_loop2 0, %xmm0 - mac_post_loop2 16, %xmm1 - .if \nrows>=8 - mac_post_loop2 32, %xmm2 - mac_post_loop2 48, %xmm3 - .if \nrows==12 - mac_post_loop2 64, %xmm4 - mac_post_loop2 80, %xmm5 - .endif - .endif - .purgem mac_post_loop2 - - .endm - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -# FORTRAN Interface: -# -# subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh) -# -# integer, intent(in) :: nb, nq, ldq, ldh -# real*8, intent(inout) :: q(ldq,*) -# real*8, intent(in) :: hh(ldh,*) -# -# Parameter mapping to registers -# parameter 1: %rdi : q -# parameter 2: %rsi : hh -# parameter 3: %rdx : nb -# parameter 4: %rcx : nq -# parameter 5: %r8 : ldq -# parameter 6: %r9 : ldh -# -#------------------------------------------------------------------------------- - -#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL -#!f> interface -#!f> subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh) bind(C,name="double_hh_trafo") -#!f> use, intrinsic :: iso_c_binding -#!f> integer(kind=c_int) :: nb, nq, ldq, ldh -#!f> real(kind=c_double) :: q(*) -#!f> real(kind=c_double) :: hh(nb,6) -#!f> end subroutine -#!f> end interface -#!f>#endif - .align 16,0x90 -double_hh_trafo: - - # Get integer parameters into corresponding registers - - movslq (%rdx), %rdx # nb - movslq (%rcx), %rcx # nq - movslq (%r8), %r8 # ldq - movslq (%r9), %r9 # ldh - - # Get 
ldq in bytes - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 # 8*ldq, i.e. ldq in bytes - - # Get ldh in bytes - addq %r9, %r9 - addq %r9, %r9 - addq %r9, %r9 # 8*ldh, i.e. ldh in bytes - - # set %rax to the address of hh at the end of the loops, - # i.e. if %rdx >= %rax we must jump out of the loop. - # please note: %rax = 8*%rdx + %rsi - 8 - movq %rdx, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rsi, %rax - subq $8, %rax - -#----------------------------------------------------------- - # Calculate the dot product of the two Householder vectors - - # decrement stack pointer to make space for s - subq $8, %rsp - -# Fortran code: -# s = hh(2,2)*1 -# do i=3,nb -# s = s+hh(i,2)*hh(i-1,1) -# enddo - - movq %rsi, %r11 # Copy address of hh - - movsd 8(%r11,%r9), %xmm0 # hh(2,2) - addq $8, %r11 -1: - cmpq %rax, %r11 - jge 2f - movsd (%r11), %xmm14 # hh(i-1,1) - movsd 8(%r11,%r9), %xmm15 # hh(i,2) - mulsd %xmm14, %xmm15 - addsd %xmm15, %xmm0 - addq $8, %r11 - jmp 1b -2: - movsd %xmm0, (%rsp) # put s on top of stack -#----------------------------------------------------------- - -rloop_s: - cmpq $8, %rcx # if %rcx <= 8 jump out of loop - jle rloop_e - hh_trafo_real 12 # transform 12 rows - addq $96, %rdi # increment q start adress by 96 bytes (6 rows) - subq $12, %rcx # decrement nq - jmp rloop_s -rloop_e: - - cmpq $4, %rcx # if %rcx <= 4 jump to test_2 - jle test_4 - hh_trafo_real 8 # transform 8 rows - jmp return1 - -test_4: - cmpq $0, %rcx # if %rcx <= 0 jump to return - jle return1 - hh_trafo_real 4 # transform 4 rows - -return1: - addq $8, %rsp # reset stack pointer - ret - - .align 16,0x90 - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - - .macro hh_trafo_complex nrows - - # When this macro is called, the following registers are set and must not be changed - # %rdi: Address of q - # %rsi: Address of hh - # %rdx: nb - # %rcx: 
Remaining rows nq - # %r8: ldq in bytes - - movq %rdi, %r10 # Copy address of q - movq %rsi, %r11 # Copy address of hh - - # set %rax to the address of hh at the end of the loops, - # i.e. if %rdx >= %rax we must jump out of the loop. - # please note: %rax = 16*%rdx + %rsi - movq %rdx, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rax, %rax - addq %rsi, %rax - -# x1 = q(1,1); y1 = 0 -# x2 = q(2,1); y2 = 0 -# ... - - movaps (%r10), %xmm0 - movaps 16(%r10), %xmm1 - xorps %xmm6, %xmm6 - xorps %xmm7, %xmm7 - .if \nrows>=4 - movaps 32(%r10), %xmm2 - movaps 48(%r10), %xmm3 - xorps %xmm8, %xmm8 - xorps %xmm9, %xmm9 - .if \nrows==6 - movaps 64(%r10), %xmm4 - movaps 80(%r10), %xmm5 - xorps %xmm10, %xmm10 - xorps %xmm11, %xmm11 - .endif - .endif - -# do i=2,nb -# h1 = conjg(hh(i)) -# x1 = x1 + q(1,i)*h1 -# x2 = x2 + q(2,i)*h1 -# ... -# enddo - - addq $16, %r11 # %r11 => hh(2) - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # real(hh(i)) - movddup 8(%r11), %xmm15 # imag(hh(i)) - - .macro mac_loop1 qoff, X, Y - movaps \qoff(%r10), %xmm13 # q(.,i) - movaps %xmm13, %xmm12 - mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i)) - addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i)) - mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i)) - addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i)) - .endm - - mac_loop1 0, %xmm0, %xmm6 - mac_loop1 16, %xmm1, %xmm7 - .if \nrows>=4 - mac_loop1 32, %xmm2, %xmm8 - mac_loop1 48, %xmm3, %xmm9 - .if \nrows==6 - mac_loop1 64, %xmm4, %xmm10 - mac_loop1 80, %xmm5, %xmm11 - .endif - .endif - - .purgem mac_loop1 - - addq $16, %r11 # %r11 => hh(i+1) - jmp 1b -2: - - # Now the content of the yn has to be swapped and added to xn - .macro mac_post_loop_1 X, Y - shufpd $1, \Y, \Y - addpd \Y, \X - .endm - - mac_post_loop_1 %xmm0, %xmm6 - mac_post_loop_1 %xmm1, %xmm7 - .if \nrows>=4 - mac_post_loop_1 %xmm2, %xmm8 - mac_post_loop_1 %xmm3, %xmm9 - .if \nrows==6 - 
mac_post_loop_1 %xmm4, %xmm10 - mac_post_loop_1 %xmm5, %xmm11 - .endif - .endif - .purgem mac_post_loop_1 - -# tau1 = hh(1) -# -# h1 = -tau1 -# x1 = x1*h1; y1 = x1 with halfes exchanged -# x2 = x2*h1; y2 = x2 with halfes exchanged -# ... - - movq %rsi, %r11 # restore address of hh - - xorps %xmm14, %xmm14 - movddup (%r11), %xmm12 # real(hh(1)) - subpd %xmm12, %xmm14 #-real(hh(1)) - xorps %xmm15, %xmm15 - movddup 8(%r11), %xmm12 # imag(hh(1)) - subpd %xmm12, %xmm15 #-imag(hh(1)) - - .macro mac_xform X, Y - movaps \X, %xmm12 - shufpd $1, \X, %xmm12 - mulpd %xmm15, %xmm12 - mulpd %xmm14, \X - addsubpd %xmm12, \X - movaps \X, \Y # copy to y - shufpd $1, \X, \Y # exchange halfes - .endm - - mac_xform %xmm0, %xmm6 - mac_xform %xmm1, %xmm7 - .if \nrows>=4 - mac_xform %xmm2, %xmm8 - mac_xform %xmm3, %xmm9 - .if \nrows==6 - mac_xform %xmm4, %xmm10 - mac_xform %xmm5, %xmm11 - .endif - .endif - .purgem mac_xform - -# q(1,1) = q(1,1) + x1 -# q(2,1) = q(2,1) + x2 -# ... - - movq %rdi, %r10 # restore address of q - .macro mac_pre_loop2 qoff, X - movaps \qoff(%r10), %xmm13 # q(.,1) - addpd \X, %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_pre_loop2 0, %xmm0 - mac_pre_loop2 16, %xmm1 - .if \nrows>=4 - mac_pre_loop2 32, %xmm2 - mac_pre_loop2 48, %xmm3 - .if \nrows==6 - mac_pre_loop2 64, %xmm4 - mac_pre_loop2 80, %xmm5 - .endif - .endif - .purgem mac_pre_loop2 - -# do i=2,nb -# h1 = hh(i) -# q(1,i) = q(1,i) + x1*h1 -# q(2,i) = q(2,i) + x2*h1 -# ... 
-# enddo - - addq $16, %r11 - .align 16 -1: - cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax - jge 2f - - addq %r8, %r10 # %r10 => q(.,i) - - movddup (%r11), %xmm14 # real(hh(i)) - movddup 8(%r11), %xmm15 # imag(hh(i)) - - .macro mac_loop2 qoff, X, Y - movaps \X, %xmm13 - mulpd %xmm14, %xmm13 - movaps \Y, %xmm12 - mulpd %xmm15, %xmm12 - addsubpd %xmm12, %xmm13 - addpd \qoff(%r10), %xmm13 - movaps %xmm13, \qoff(%r10) - .endm - - mac_loop2 0, %xmm0, %xmm6 - mac_loop2 16, %xmm1, %xmm7 - .if \nrows>=4 - mac_loop2 32, %xmm2, %xmm8 - mac_loop2 48, %xmm3, %xmm9 - .if \nrows==6 - mac_loop2 64, %xmm4, %xmm10 - mac_loop2 80, %xmm5, %xmm11 - .endif - .endif - .purgem mac_loop2 - - addq $16, %r11 - jmp 1b -2: - .endm - -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -# FORTRAN Interface: -# -# subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) -# -# integer, intent(in) :: nb, nq, ldq -# complex*16, intent(inout) :: q(ldq,*) -# complex*16, intent(in) :: hh(*) -# -# Parameter mapping to registers -# parameter 1: %rdi : q -# parameter 2: %rsi : hh -# parameter 3: %rdx : nb -# parameter 4: %rcx : nq -# parameter 5: %r8 : ldq -# -#------------------------------------------------------------------------------- -#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL -#!f> interface -#!f> subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) bind(C,name="single_hh_trafo_complex") -#!f> use, intrinsic :: iso_c_binding -#!f> integer(kind=c_int) :: nb, nq, ldq -#!f> complex(kind=c_double) :: q(*) -#!f> complex(kind=c_double) :: hh(nb,2) -#!f> end subroutine -#!f> end interface -#!f>#endif - .align 16,0x90 -single_hh_trafo_complex: - - # Get integer parameters into corresponding registers - - movslq (%rdx), %rdx # nb - movslq (%rcx), %rcx # nq - movslq (%r8), %r8 # ldq - - # Get ldq in bytes - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 - addq %r8, %r8 # 16*ldq, i.e. 
ldq in bytes - -cloop_s: - cmpq $4, %rcx # if %rcx <= 4 jump out of loop - jle cloop_e - hh_trafo_complex 6 # transform 6 rows - addq $96, %rdi # increment q start adress by 96 bytes (6 rows) - subq $6, %rcx # decrement nq - jmp cloop_s -cloop_e: - - cmpq $2, %rcx # if %rcx <= 2 jump to test_2 - jle test_2 - hh_trafo_complex 4 # transform 4 rows - jmp return2 - -test_2: - cmpq $0, %rcx # if %rcx <= 0 jump to return - jle return2 - hh_trafo_complex 2 # transform 2 rows - -return2: - ret - - .align 16,0x90 -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- -#------------------------------------------------------------------------------- - -# Declare that we do not need an executable stack here - .section .note.GNU-stack,"",@progbits diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,558 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. 
The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#include -#include - -#define __forceinline __attribute__((always_inline)) - -#ifdef HAVE_AVX2 - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) -#endif - -#endif - -//Forward declaration -static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); - -/* -!f>#ifdef HAVE_AVX -!f> interface -!f> subroutine single_hh_trafo_complex_avx_avx2_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_avx_avx2_1hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq -!f> complex(kind=c_double) :: q(*) -!f> complex(kind=c_double) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void single_hh_trafo_complex_avx_avx2_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - //int ldh = *pldh; - - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); 
- } - if (nq-i > 4) - { - hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); - } -} - - static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2, x3, x4, x5, x6; - __m256d q1, q2, q3, q4, q5, q6; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - x3 = _mm256_load_pd(&q_dbl[8]); - x4 = _mm256_load_pd(&q_dbl[12]); - x5 = _mm256_load_pd(&q_dbl[16]); - x6 = _mm256_load_pd(&q_dbl[20]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); - q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, 
_mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - tmp5 = _mm256_mul_pd(h1_imag, q5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#else - x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#endif - tmp6 = _mm256_mul_pd(h1_imag, q6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#else - x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef 
__ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - tmp5 = _mm256_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)); -#else - x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)); -#endif - tmp6 = _mm256_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)); -#else - x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - q5 = _mm256_load_pd(&q_dbl[16]); - q6 = _mm256_load_pd(&q_dbl[20]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - q5 = _mm256_add_pd(q5, x5); - q6 = _mm256_add_pd(q6, x6); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - _mm256_store_pd(&q_dbl[16], q5); - _mm256_store_pd(&q_dbl[20], q6); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]); - q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, 
x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - tmp5 = _mm256_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#else - q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5))); -#endif - tmp6 = _mm256_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#else - q6 = _mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - _mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5); - _mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6); - } -} - -static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2, x3, x4; - __m256d q1, q2, q3, q4; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m256d sign = 
(__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - x3 = _mm256_load_pd(&q_dbl[8]); - x4 = _mm256_load_pd(&q_dbl[12]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); 
-#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = 
_mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - } -} - -static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m256d x1, x2; - __m256d q1, q2; - __m256d h1_real, h1_imag; - __m256d tmp1, tmp2; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[0]); - x2 = _mm256_load_pd(&q_dbl[4]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), 
_mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - - for (i = 1; i < nb; i++) - { - h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - 
_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - } -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1393 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#include -#include - -#define __forceinline __attribute__((always_inline)) - -#ifdef HAVE_AVX2 - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) -#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) -#endif - -#endif - -//Forward declaration -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); - -/* -!f>#ifdef HAVE_AVX -!f> interface -!f> subroutine double_hh_trafo_complex_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_avx_avx2_2hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> complex(kind=c_double) :: q(*) -!f> complex(kind=c_double) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void double_hh_trafo_complex_avx_avx2_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - double complex s = conj(hh[(ldh)+1])*1.0; 
- for (i = 2; i < nb; i++) - { - s += hh[i-1] * conj(hh[(i+ldh)]); - } - -#if 1 - for (i = 0; i < nq-4; i+=8) - { - hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 0) - { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 2) - { - hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#endif -} - -static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2, x3, x4; - __m256d y1, y2, y3, y4; - __m256d q1, q2, q3, q4; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); - x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - y3 = _mm256_load_pd(&q_dbl[8]); - y4 = _mm256_load_pd(&q_dbl[12]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = 
_mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), 
_mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, q4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - 
q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = 
_mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - tmp4 = _mm256_mul_pd(h1_imag, y4); -#ifdef __ELPA_USE_FMA__ - y4 = 
_mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#else - y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - q4 = _mm256_load_pd(&q_dbl[12]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - q3 = _mm256_add_pd(q3, y3); - q4 = _mm256_add_pd(q4, y4); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - _mm256_store_pd(&q_dbl[12], q4); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); - q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]); - - q1 = 
_mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - q4 = _mm256_add_pd(q4, x4); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); - _mm256_store_pd(&q_dbl[(ldq*2)+12], q4); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); 
-#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, 
_mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - tmp4 = _mm256_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#else - q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); -} - -static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, 
int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2, x3; - __m256d y1, y2, y3; - __m256d q1, q2, q3; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2, tmp3; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - y3 = _mm256_load_pd(&q_dbl[8]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate 
- h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - } - - h1_real = 
_mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - x3 = 
_mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - tmp3 = _mm256_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#else - y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); 
-#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - q3 = _mm256_load_pd(&q_dbl[8]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - q3 = _mm256_add_pd(q3, y3); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - _mm256_store_pd(&q_dbl[8], q3); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - q3 = _mm256_add_pd(q3, x3); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - 
_mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = 
_mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - tmp3 = _mm256_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#else - q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); -} - -static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1, x2; - __m256d y1, y2; - __m256d q1, q2; - __m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1, tmp2; - int i=0; - - __m256d sign = 
(__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - y2 = _mm256_load_pd(&q_dbl[4]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = 
_mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = 
_mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - tmp2 = _mm256_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#else - y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, 
_mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - q2 = _mm256_load_pd(&q_dbl[4]); - - q1 = _mm256_add_pd(q1, y1); - q2 = _mm256_add_pd(q2, y2); - - _mm256_store_pd(&q_dbl[0], q1); - _mm256_store_pd(&q_dbl[4], q2); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); - - q1 = _mm256_add_pd(q1, x1); - q2 = _mm256_add_pd(q2, x2); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __FMA4_ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( 
_mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - tmp2 = _mm256_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#else - q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); -} - -static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m256d x1; - __m256d y1; - __m256d q1; - 
__m256d h1_real, h1_imag, h2_real, h2_imag; - __m256d tmp1; - int i=0; - - __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); - - x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - y1 = _mm256_load_pd(&q_dbl[0]); - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm256_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - } - - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm256_xor_pd(h1_imag, 
sign); -#endif - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm256_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[0]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - - h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm256_xor_pd(h1_real, sign); - h1_imag = _mm256_xor_pd(h1_imag, sign); - h2_real = _mm256_xor_pd(h2_real, sign); - h2_imag = _mm256_xor_pd(h2_imag, sign); - - __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); - __m256d tmp2 = _mm256_broadcast_pd(&tmp_s_128); - tmp1 = _mm256_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); - h2_real = _mm256_broadcast_sd(&s_dbl[0]); - h2_imag = _mm256_broadcast_sd(&s_dbl[1]); - - tmp1 = _mm256_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#else - y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); -#endif - - tmp1 = _mm256_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, 
_mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - q1 = _mm256_load_pd(&q_dbl[0]); - - q1 = _mm256_add_pd(q1, y1); - - _mm256_store_pd(&q_dbl[0], q1); - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); - - q1 = _mm256_add_pd(q1, x1); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); - - for (i = 2; i < nb; i++) - { - q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm256_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); - } - h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm256_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 
0x5))); -#else - q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); -#endif - - _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex.F90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex.F90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,888 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! 
license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! It should be compiled with the highest possible optimization level. -! -! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module complex_generic_kernel - - private - public single_hh_trafo_complex_generic -contains - subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: single_hh_trafo_complex_generic") -#endif - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Do the Householder transformations - - ! 
Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_12(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - enddo - - ! i > nq-8 now, i.e. at most 8 rows remain - - if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_8(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq) -#else - call hh_trafo_complex_kernel_4(q(i:ldq,1:nb),hh(1:nb), nb, ldq) -#endif - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: single_hh_trafo_complex_generic") -#endif - end subroutine single_hh_trafo_complex_generic - - ! -------------------------------------------------------------------------------------------------- - - subroutine double_hh_trafo_complex_generic(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck) :: s - - integer(kind=ik) :: i - - ! Safety only: -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: double_hh_trafo_complex_generic") -#endif - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Calculate dot product of the two Householder vectors - - s = conjg(hh(2,2)*1) - do i=3,nb - s = s+(conjg(hh(i,2))*hh(i-1,1)) - enddo - - ! Do the Householder transformations - - ! 
Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq,4 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - enddo - - !do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! call hh_trafo_complex_kernel_12_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - ! call hh_trafo_complex_kernel_12_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - !enddo - - ! i > nq-8 now, i.e. at most 8 rows remain - - !if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! call hh_trafo_complex_kernel_8_2hv(q(i,1),hh, nb, ldq, ldh, s) -#else - ! call hh_trafo_complex_kernel_8_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - !else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - ! call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#else - -#endif - !endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: double_hh_trafo_complex_generic") -#endif - - end subroutine double_hh_trafo_complex_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_12") -#endif - - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - x5 = q(5,1) - x6 = q(6,1) - x7 = q(7,1) - x8 = q(8,1) - x9 = q(9,1) - xa = q(10,1) - xb = q(11,1) - xc = q(12,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - x5 = x5 + q(5,i)*h1 - x6 = x6 + q(6,i)*h1 - x7 = x7 + q(7,i)*h1 - x8 = x8 + q(8,i)*h1 - x9 = x9 + q(9,i)*h1 - xa = xa + q(10,i)*h1 - xb = xb + q(11,i)*h1 - xc = xc + q(12,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - x9 = x9*h1 - xa = xa*h1 - xb = xb*h1 - xc = xc*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - q(5,1) = q(5,1) + x5 - q(6,1) = q(6,1) + x6 - q(7,1) = q(7,1) + x7 - q(8,1) = q(8,1) + x8 - q(9,1) = q(9,1) + x9 - q(10,1) = q(10,1) + xa - q(11,1) = q(11,1) + xb - q(12,1) = q(12,1) + xc - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - q(5,i) = q(5,i) + x5*h1 - q(6,i) = q(6,i) + x6*h1 - q(7,i) = q(7,i) + x7*h1 - q(8,i) = q(8,i) + x8*h1 - q(9,i) = q(9,i) + 
x9*h1 - q(10,i) = q(10,i) + xa*h1 - q(11,i) = q(11,i) + xb*h1 - q(12,i) = q(12,i) + xc*h1 - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_12") -#endif - - end subroutine hh_trafo_complex_kernel_12 - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, x7, x8 - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_8") -#endif - - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - x5 = q(5,1) - x6 = q(6,1) - x7 = q(7,1) - x8 = q(8,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - x5 = x5 + q(5,i)*h1 - x6 = x6 + q(6,i)*h1 - x7 = x7 + q(7,i)*h1 - x8 = x8 + q(8,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - q(5,1) = q(5,1) + x5 - q(6,1) = q(6,1) + x6 - q(7,1) = q(7,1) + x7 - q(8,1) = q(8,1) + x8 - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - q(5,i) = q(5,i) + x5*h1 - q(6,i) = q(6,i) + x6*h1 - q(7,i) = q(7,i) + x7*h1 - q(8,i) = q(8,i) + x8*h1 - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: 
hh_trafo_complex_kernel_8") -#endif - end subroutine hh_trafo_complex_kernel_8 - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - complex(kind=ck) :: x1, x2, x3, x4 - complex(kind=ck) :: h1, tau1 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_4") -#endif - x1 = q(1,1) - x2 = q(2,1) - x3 = q(3,1) - x4 = q(4,1) - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = conjg(hh(i)) - x1 = x1 + q(1,i)*h1 - x2 = x2 + q(2,i)*h1 - x3 = x3 + q(3,i)*h1 - x4 = x4 + q(4,i)*h1 - enddo - - tau1 = hh(1) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - - q(1,1) = q(1,1) + x1 - q(2,1) = q(2,1) + x2 - q(3,1) = q(3,1) + x3 - q(4,1) = q(4,1) + x4 - - !DEC$ VECTOR ALIGNED - do i=2,nb - h1 = hh(i) - q(1,i) = q(1,i) + x1*h1 - q(2,i) = q(2,i) + x2*h1 - q(3,i) = q(3,i) + x3*h1 - q(4,i) = q(4,i) + x4*h1 - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_4") -#endif - - end subroutine hh_trafo_complex_kernel_4 - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = 
q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv") -#endif - - end subroutine hh_trafo_complex_kernel_4_2hv - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, y1, y2, y3, y4, y5, y6, y7, y8 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv") -#endif - - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) - y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) - y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) - y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 
+ q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = y8 + q(8,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) - x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) - x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) - x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_8_2hv") 
-#endif - - end subroutine hh_trafo_complex_kernel_8_2hv - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(:,:) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck), intent(in) :: s - - complex(kind=ck) :: x1, x2, x3, x4, x5, x6 ,x7, x8, x9, x10, x11, x12, y1, y2, y3, y4, y5, y6, & - y7, y8, y9, y10, y11, y12 - complex(kind=ck) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_complex_kernel_12_2hv") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) - x9 = q(9,2) - x10 = q(10,2) - x11 = q(11,2) - x12 = q(12,2) - - y1 = q(1,1) + q(1,2)*conjg(hh(2,2)) - y2 = q(2,1) + q(2,2)*conjg(hh(2,2)) - y3 = q(3,1) + q(3,2)*conjg(hh(2,2)) - y4 = q(4,1) + q(4,2)*conjg(hh(2,2)) - y5 = q(5,1) + q(5,2)*conjg(hh(2,2)) - y6 = q(6,1) + q(6,2)*conjg(hh(2,2)) - y7 = q(7,1) + q(7,2)*conjg(hh(2,2)) - y8 = q(8,1) + q(8,2)*conjg(hh(2,2)) - y9 = q(9,1) + q(9,2)*conjg(hh(2,2)) - y10 = q(10,1) + q(10,2)*conjg(hh(2,2)) - y11 = q(11,1) + q(11,2)*conjg(hh(2,2)) - y12 = q(12,1) + q(12,2)*conjg(hh(2,2)) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 + q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = 
y8 + q(8,i)*h2 - x9 = x9 + q(9,i)*h1 - y9 = y9 + q(9,i)*h2 - x10 = x10 + q(10,i)*h1 - y10 = y10 + q(10,i)*h2 - x11 = x11 + q(11,i)*h1 - y11 = y11 + q(11,i)*h2 - x12 = x12 + q(12,i)*h1 - y12 = y12 + q(12,i)*h2 - enddo - - x1 = x1 + q(1,nb+1)*conjg(hh(nb,1)) - x2 = x2 + q(2,nb+1)*conjg(hh(nb,1)) - x3 = x3 + q(3,nb+1)*conjg(hh(nb,1)) - x4 = x4 + q(4,nb+1)*conjg(hh(nb,1)) - x5 = x5 + q(5,nb+1)*conjg(hh(nb,1)) - x6 = x6 + q(6,nb+1)*conjg(hh(nb,1)) - x7 = x7 + q(7,nb+1)*conjg(hh(nb,1)) - x8 = x8 + q(8,nb+1)*conjg(hh(nb,1)) - x9 = x9 + q(9,nb+1)*conjg(hh(nb,1)) - x10 = x10 + q(10,nb+1)*conjg(hh(nb,1)) - x11 = x11 + q(11,nb+1)*conjg(hh(nb,1)) - x12 = x12 + q(12,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 - x9 = x9*h1 - x10 = x10*h1 - x11 = x11*h1 - x12 = x12*h1 - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 - y9 = y9*h1 + x9*h2 - y10 = y10*h1 + x10*h2 - y11 = y11*h1 + x11*h2 - y12 = y12*h1 + x12*h2 - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 - q(9,1) = q(9,1) + y9 - q(10,1) = q(10,1) + y10 - q(11,1) = q(11,1) + y11 - q(12,1) = q(12,1) + y12 - - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) - q(9,2) = q(9,2) + x9 + y9*hh(2,2) - q(10,2) = q(10,2) + x10 + y10*hh(2,2) - q(11,2) = q(11,2) + x11 + y11*hh(2,2) - q(12,2) = q(12,2) + x12 + y12*hh(2,2) - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = 
q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 - q(9,i) = q(9,i) + x9*h1 + y9*h2 - q(10,i) = q(10,i) + x10*h1 + y10*h2 - q(11,i) = q(11,i) + x11*h1 + y11*h2 - q(12,i) = q(12,i) + x12*h1 + y12*h2 - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) - q(9,nb+1) = q(9,nb+1) + x9*hh(nb,1) - q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) - q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) - q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_complex_kernel_12_2hv") -#endif - - end subroutine hh_trafo_complex_kernel_12_2hv -end module complex_generic_kernel -! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,177 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! 
Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! This is the small and simple version (no hand unrolling of loops etc.) but for some -! compilers this performs better than a sophisticated version with transformed and unrolled loops. -! -! It should be compiled with the highest possible optimization level. -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! 
distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module complex_generic_simple_kernel - - private - public single_hh_trafo_complex_generic_simple -contains - subroutine single_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb) - complex(kind=ck), intent(in) :: hh(1:nb) -#endif - integer(kind=ik) :: i - complex(kind=ck) :: h1, tau1, x(nq) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple") -#endif - ! Just one Householder transformation - - x(1:nq) = q(1:nq,1) - - do i=2,nb - x(1:nq) = x(1:nq) + q(1:nq,i)*conjg(hh(i)) - enddo - - tau1 = hh(1) - x(1:nq) = x(1:nq)*(-tau1) - - q(1:nq,1) = q(1:nq,1) + x(1:nq) - - do i=2,nb - q(1:nq,i) = q(1:nq,i) + x(1:nq)*hh(i) - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple") -#endif - end subroutine single_hh_trafo_complex_generic_simple - - ! 
-------------------------------------------------------------------------------------------------- - subroutine double_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq,*) - complex(kind=ck), intent(in) :: hh(ldh,*) -#else - complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1) - complex(kind=ck), intent(in) :: hh(1:ldh,1:2) -#endif - complex(kind=ck) :: s, h1, h2, tau1, tau2, x(nq), y(nq) - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel complex generic simple: double_hh_trafo_complex_generic_simple") -#endif - ! Calculate dot product of the two Householder vectors - - s = conjg(hh(2,2))*1 - do i=3,nb - s = s+(conjg(hh(i,2))*hh(i-1,1)) - enddo - - ! Do the Householder transformations - - x(1:nq) = q(1:nq,2) - - y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2)) - - do i=3,nb - h1 = conjg(hh(i-1,1)) - h2 = conjg(hh(i,2)) - x(1:nq) = x(1:nq) + q(1:nq,i)*h1 - y(1:nq) = y(1:nq) + q(1:nq,i)*h2 - enddo - - x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1)) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x(1:nq) = x(1:nq)*h1 - h1 = -tau2 - h2 = -tau2*s - y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2 - - q(1:nq,1) = q(1:nq,1) + y(1:nq) - q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2 - enddo - - q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple") -#endif - end subroutine double_hh_trafo_complex_generic_simple -end module complex_generic_simple_kernel -! 
-------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,548 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. 
If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include -#include - -#define __forceinline __attribute__((always_inline)) - -#ifdef HAVE_SSE_INTRINSICS -#undef __AVX__ -#endif - - -//Forward declaration -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); - -/* -!f>#ifdef HAVE_SSE_INTRINSICS -!f> interface -!f> subroutine single_hh_trafo_complex_sse_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_sse_1hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq -!f> complex(kind=c_double) :: q(*) -!f> complex(kind=c_double) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void single_hh_trafo_complex_sse_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - //int ldh = *pldh; - - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq); - } - if (nq-i > 2) - { - hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); - } -} - -static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m128d x1, x2, x3, x4, x5, x6; - __m128d q1, q2, q3, q4, q5, q6; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - int i=0; - - __m128d 
sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - x3 = _mm_load_pd(&q_dbl[4]); - x4 = _mm_load_pd(&q_dbl[6]); - x5 = _mm_load_pd(&q_dbl[8]); - x6 = _mm_load_pd(&q_dbl[10]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); - q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - tmp5 = _mm_mul_pd(h1_imag, q5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, 
_MM_SHUFFLE2(0,1)))); -#else - x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#endif - tmp6 = _mm_mul_pd(h1_imag, q6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#else - x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - tmp5 = _mm_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); -#else - x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); -#endif - tmp6 = _mm_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); 
-#else - x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - q5 = _mm_load_pd(&q_dbl[8]); - q6 = _mm_load_pd(&q_dbl[10]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = _mm_add_pd(q4, x4); - q5 = _mm_add_pd(q5, x5); - q6 = _mm_add_pd(q6, x6); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - _mm_store_pd(&q_dbl[6], q4); - _mm_store_pd(&q_dbl[8], q5); - _mm_store_pd(&q_dbl[10], q6); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); - q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, 
_mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - tmp5 = _mm_mul_pd(h1_imag, x5); -#ifdef __ELPA_USE_FMA__ - q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#else - q5 = _mm_add_pd(q5, _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); -#endif - tmp6 = _mm_mul_pd(h1_imag, x6); -#ifdef __ELPA_USE_FMA__ - q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#else - q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - _mm_store_pd(&q_dbl[(2*i*ldq)+8], q5); - _mm_store_pd(&q_dbl[(2*i*ldq)+10], q6); - } -} - -static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m128d x1, x2, x3, x4; - __m128d q1, q2, q3, q4; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - x3 = _mm_load_pd(&q_dbl[4]); - x4 = _mm_load_pd(&q_dbl[6]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, 
_mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = 
_mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = _mm_add_pd(q4, x4); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - _mm_store_pd(&q_dbl[6], q4); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, 
_MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - } -} - -static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - - __m128d x1, x2; - __m128d q1, q2; - __m128d h1_real, h1_imag; - __m128d tmp1, tmp2; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[0]); - x2 = _mm_load_pd(&q_dbl[2]); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = 
_mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - - for (i = 1; i < nb; i++) - { - h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - } -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1378 +0,0 @@ -// This file is part of ELPA. 
-// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#include -#include - -#define __forceinline __attribute__((always_inline)) - -#ifdef HAVE_SSE_INTRINSICS -#undef __AVX__ -#endif - - -//Forward declaration -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); -static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); - -/* -!f>#ifdef HAVE_SSE_INTRINSICS -!f> interface -!f> subroutine double_hh_trafo_complex_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_sse_2hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> 
complex(kind=c_double) :: q(*) -!f> complex(kind=c_double) :: hh(pnb,2) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void double_hh_trafo_complex_sse_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - double complex s = conj(hh[(ldh)+1])*1.0; - for (i = 2; i < nb; i++) - { - s += hh[i-1] * conj(hh[(i+ldh)]); - } - -#if 1 - for (i = 0; i < nq; i+=4) - { - hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#else - for (i = 0; i < nq-2; i+=3) - { - hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq-i > 1) - { - hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } -#endif -} - -static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2, x3, x4; - __m128d y1, y2, y3, y4; - __m128d q1, q2, q3, q4; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, tmp2, tmp3, tmp4; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); - x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = _mm_load_pd(&q_dbl[2]); - y3 = _mm_load_pd(&q_dbl[4]); - y4 = _mm_load_pd(&q_dbl[6]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); 
-#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = 
_mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, q4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = 
_mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, q4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, 
_MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), 
_mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - tmp4 = _mm_mul_pd(h1_imag, y4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#else - y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, x4); -#ifdef __ELPA_USE_FMA__ - y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - q4 = _mm_load_pd(&q_dbl[6]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - q3 = _mm_add_pd(q3, y3); - q4 = _mm_add_pd(q4, y4); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - _mm_store_pd(&q_dbl[6], q4); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - q3 = 
_mm_load_pd(&q_dbl[(ldq*2)+4]); - q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - q4 = _mm_add_pd(q4, x4); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - _mm_store_pd(&q_dbl[(ldq*2)+4], q3); - _mm_store_pd(&q_dbl[(ldq*2)+6], q4); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, 
x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h2_imag, y4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, 
tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - tmp4 = _mm_mul_pd(h1_imag, x4); -#ifdef __ELPA_USE_FMA__ - q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#else - q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); - _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); -} - -static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(double complex* q, double 
complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2, x3; - __m128d y1, y2, y3; - __m128d q1, q2, q3; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, tmp2, tmp3; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = _mm_load_pd(&q_dbl[2]); - y3 = _mm_load_pd(&q_dbl[4]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - 
tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, q3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); 
-#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, q3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - - h1_real 
= _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - tmp3 = _mm_mul_pd(h1_imag, y3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#else - y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, 
x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, x3); -#ifdef __ELPA_USE_FMA__ - y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - q3 = _mm_load_pd(&q_dbl[4]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - q3 = _mm_add_pd(q3, y3); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - _mm_store_pd(&q_dbl[4], q3); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - q3 = _mm_add_pd(q3, x3); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - _mm_store_pd(&q_dbl[(ldq*2)+4], q3); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = 
_mm_load_pd(&q_dbl[(2*i*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h2_imag, y3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - 
_mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - tmp3 = _mm_mul_pd(h1_imag, x3); -#ifdef __ELPA_USE_FMA__ - q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#else - q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); - _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); -} - -static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1, x2; - __m128d y1, y2; - __m128d q1, q2; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1, tmp2; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef 
__ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - y2 = _mm_load_pd(&q_dbl[2]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, 
q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, q2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, q2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - 
h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - tmp2 = _mm_mul_pd(h1_imag, y2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#else - y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, x2); -#ifdef __ELPA_USE_FMA__ - y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - q2 = _mm_load_pd(&q_dbl[2]); - - q1 = _mm_add_pd(q1, y1); - q2 = _mm_add_pd(q2, y2); - - _mm_store_pd(&q_dbl[0], q1); - _mm_store_pd(&q_dbl[2], q2); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = 
_mm_load_pd(&q_dbl[(ldq*2)+0]); - q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); - - q1 = _mm_add_pd(q1, x1); - q2 = _mm_add_pd(q2, x2); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(ldq*2)+0], q1); - _mm_store_pd(&q_dbl[(ldq*2)+2], q2); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h2_imag, y2); -#ifdef 
__ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - tmp2 = _mm_mul_pd(h1_imag, x2); -#ifdef __ELPA_USE_FMA__ - q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#else - q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); - _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); -} - -static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) -{ - double* q_dbl = (double*)q; - double* hh_dbl = (double*)hh; - double* s_dbl = (double*)(&s); - - __m128d x1; - __m128d y1; - __m128d q1; - __m128d h1_real, h1_imag, h2_real, h2_imag; - __m128d tmp1; - int i=0; - - __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); - - x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - y1 = _mm_load_pd(&q_dbl[0]); - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, 
_mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h2_imag = _mm_xor_pd(h2_imag, sign); -#endif - - tmp1 = _mm_mul_pd(h2_imag, q1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); -#ifndef __ELPA_USE_FMA__ - // conjugate - h1_imag = _mm_xor_pd(h1_imag, sign); -#endif - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm_mul_pd(h1_imag, q1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[0]); - h1_imag = _mm_loaddup_pd(&hh_dbl[1]); - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, 
tmp1, _MM_SHUFFLE2(0,1))); -#else - x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - - h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); - - h1_real = _mm_xor_pd(h1_real, sign); - h1_imag = _mm_xor_pd(h1_imag, sign); - h2_real = _mm_xor_pd(h2_real, sign); - h2_imag = _mm_xor_pd(h2_imag, sign); - - __m128d tmp2 = _mm_loadu_pd(s_dbl); - tmp1 = _mm_mul_pd(h2_imag, tmp2); -#ifdef __ELPA_USE_FMA__ - tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - _mm_storeu_pd(s_dbl, tmp2); - h2_real = _mm_loaddup_pd(&s_dbl[0]); - h2_imag = _mm_loaddup_pd(&s_dbl[1]); - - tmp1 = _mm_mul_pd(h1_imag, y1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#else - y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); -#endif - - tmp1 = _mm_mul_pd(h2_imag, x1); -#ifdef __ELPA_USE_FMA__ - y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - q1 = _mm_load_pd(&q_dbl[0]); - - q1 = _mm_add_pd(q1, y1); - - _mm_store_pd(&q_dbl[0], q1); - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); - - q1 = _mm_add_pd(q1, x1); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - 
_mm_store_pd(&q_dbl[(ldq*2)+0], q1); - - for (i = 2; i < nb; i++) - { - q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); - - h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); - h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); - - tmp1 = _mm_mul_pd(h2_imag, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); - } - - h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); - h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); - - q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); - - tmp1 = _mm_mul_pd(h1_imag, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#else - q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); -#endif - - _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,971 +0,0 @@ -// This file is part of ELPA. 
-// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - -#ifdef HAVE_AVX2 - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#endif - -#endif - -//Forward declaration -__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); - -/* -!f>#ifdef HAVE_AVX -!f> interface -!f> subroutine double_hh_trafo_real_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_avx_avx2_2hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh 
-!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar product to compute - // 2 householder vectors simultaneously - double s = hh[(ldh)+1]*1.0; - - #pragma ivdep - for (i = 2; i < nb; i++) - { - s += hh[i-1] * hh[(i+ldh)]; - } - - // Production level kernel calls with padding - for (i = 0; i < nq-20; i+=24) - { - hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - - if (nq == i) - { - return; - } - - if (nq-i == 20) - { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s); - } - else if (nq-i == 16) - { - hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i == 12) - { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s); - } - else if (nq-i == 8) - { - hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else - { - hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); - } -} -/** - * Unrolled kernel that computes - * 24 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [24 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = 
_mm256_load_pd(&q[ldq]); - __m256d x2 = _mm256_load_pd(&q[ldq+4]); - __m256d x3 = _mm256_load_pd(&q[ldq+8]); - __m256d x4 = _mm256_load_pd(&q[ldq+12]); - __m256d x5 = _mm256_load_pd(&q[ldq+16]); - __m256d x6 = _mm256_load_pd(&q[ldq+20]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_FMA_pd(x2, h1, q2); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_FMA_pd(x3, h1, q3); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_FMA_pd(x4, h1, q4); - __m256d q5 = _mm256_load_pd(&q[16]); - __m256d y5 = _mm256_FMA_pd(x5, h1, q5); - __m256d q6 = _mm256_load_pd(&q[20]); - __m256d y6 = _mm256_FMA_pd(x6, h1, q6); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - __m256d q5 = _mm256_load_pd(&q[16]); - __m256d y5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); - __m256d q6 = _mm256_load_pd(&q[20]); - __m256d y6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - y3 = _mm256_FMA_pd(q3, h2, y3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - y4 = _mm256_FMA_pd(q4, h2, y4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - x5 = 
_mm256_FMA_pd(q5, h1, x5); - y5 = _mm256_FMA_pd(q5, h2, y5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - x6 = _mm256_FMA_pd(q6, h1, x6); - y6 = _mm256_FMA_pd(q6, h2, y6); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); - y5 = _mm256_add_pd(y5, _mm256_mul_pd(q5,h2)); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1)); - y6 = _mm256_add_pd(y6, _mm256_mul_pd(q6,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - x5 = _mm256_FMA_pd(q5, h1, x5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - x6 = _mm256_FMA_pd(q6, h1, x6); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - x6 = _mm256_add_pd(x6, 
_mm256_mul_pd(q6,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [24 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - x4 = _mm256_mul_pd(x4, h1); - x5 = _mm256_mul_pd(x5, h1); - x6 = _mm256_mul_pd(x6, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); - y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); - y5 = _mm256_FMA_pd(y5, h1, _mm256_mul_pd(x5,h2)); - y6 = _mm256_FMA_pd(y6, h1, _mm256_mul_pd(x6,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); - y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); - y5 = _mm256_add_pd(_mm256_mul_pd(y5,h1), _mm256_mul_pd(x5,h2)); - y6 = _mm256_add_pd(_mm256_mul_pd(y6,h1), _mm256_mul_pd(x6,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = _mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - q3 = _mm256_load_pd(&q[8]); - q3 = _mm256_add_pd(q3, y3); - _mm256_store_pd(&q[8],q3); - q4 = _mm256_load_pd(&q[12]); - q4 = _mm256_add_pd(q4, y4); - _mm256_store_pd(&q[12],q4); - q5 = _mm256_load_pd(&q[16]); - q5 = _mm256_add_pd(q5, y5); - _mm256_store_pd(&q[16],q5); - q6 = _mm256_load_pd(&q[20]); - q6 = _mm256_add_pd(q6, y6); - _mm256_store_pd(&q[20],q6); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, 
_mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); - _mm256_store_pd(&q[ldq+12],q4); - q5 = _mm256_load_pd(&q[ldq+16]); - q5 = _mm256_add_pd(q5, _mm256_FMA_pd(y5, h2, x5)); - _mm256_store_pd(&q[ldq+16],q5); - q6 = _mm256_load_pd(&q[ldq+20]); - q6 = _mm256_add_pd(q6, _mm256_FMA_pd(y6, h2, x6)); - _mm256_store_pd(&q[ldq+20],q6); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[ldq+12],q4); - q5 = _mm256_load_pd(&q[ldq+16]); - q5 = _mm256_add_pd(q5, _mm256_add_pd(x5, _mm256_mul_pd(y5, h2))); - _mm256_store_pd(&q[ldq+16],q5); - q6 = _mm256_load_pd(&q[ldq+20]); - q6 = _mm256_add_pd(q6, _mm256_add_pd(x6, _mm256_mul_pd(y6, h2))); - _mm256_store_pd(&q[ldq+20],q6); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - q3 = 
_mm256_FMA_pd(y3, h2, q3); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - q4 = _mm256_FMA_pd(y4, h2, q4); - _mm256_store_pd(&q[(i*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - q5 = _mm256_FMA_pd(x5, h1, q5); - q5 = _mm256_FMA_pd(y5, h2, q5); - _mm256_store_pd(&q[(i*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - q6 = _mm256_FMA_pd(x6, h1, q6); - q6 = _mm256_FMA_pd(y6, h2, q6); - _mm256_store_pd(&q[(i*ldq)+20],q6); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[(i*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(i*ldq)+16]); - q5 = _mm256_add_pd(q5, _mm256_add_pd(_mm256_mul_pd(x5,h1), _mm256_mul_pd(y5, h2))); - _mm256_store_pd(&q[(i*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(i*ldq)+20]); - q6 = _mm256_add_pd(q6, _mm256_add_pd(_mm256_mul_pd(x6,h1), _mm256_mul_pd(y6, h2))); - _mm256_store_pd(&q[(i*ldq)+20],q6); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - _mm256_store_pd(&q[(nb*ldq)+12],q4); - q5 = 
_mm256_load_pd(&q[(nb*ldq)+16]); - q5 = _mm256_FMA_pd(x5, h1, q5); - _mm256_store_pd(&q[(nb*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - q6 = _mm256_FMA_pd(x6, h1, q6); - _mm256_store_pd(&q[(nb*ldq)+20],q6); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - _mm256_store_pd(&q[(nb*ldq)+12],q4); - q5 = _mm256_load_pd(&q[(nb*ldq)+16]); - q5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); - _mm256_store_pd(&q[(nb*ldq)+16],q5); - q6 = _mm256_load_pd(&q[(nb*ldq)+20]); - q6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); - _mm256_store_pd(&q[(nb*ldq)+20],q6); -#endif -} - -/** - * Unrolled kernel that computes - * 16 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [16 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - __m256d x2 = _mm256_load_pd(&q[ldq+4]); - __m256d x3 = _mm256_load_pd(&q[ldq+8]); - __m256d x4 = _mm256_load_pd(&q[ldq+12]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - 
__m256d y2 = _mm256_FMA_pd(x2, h1, q2); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_FMA_pd(x3, h1, q3); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_FMA_pd(x4, h1, q4); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - __m256d q3 = _mm256_load_pd(&q[8]); - __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - __m256d q4 = _mm256_load_pd(&q[12]); - __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - y3 = _mm256_FMA_pd(q3, h2, y3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); - y4 = _mm256_FMA_pd(q4, h2, y4); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); - y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_FMA_pd(q3, h1, x3); - q4 = 
_mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_FMA_pd(q4, h1, x4); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [16 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - x4 = _mm256_mul_pd(x4, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); - y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); - y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = _mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - q3 = _mm256_load_pd(&q[8]); - q3 = _mm256_add_pd(q3, y3); - _mm256_store_pd(&q[8],q3); - q4 = _mm256_load_pd(&q[12]); - q4 = _mm256_add_pd(q4, y4); - _mm256_store_pd(&q[12],q4); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = 
_mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); - _mm256_store_pd(&q[ldq+12],q4); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); - q3 = _mm256_load_pd(&q[ldq+8]); - q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[ldq+8],q3); - q4 = _mm256_load_pd(&q[ldq+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[ldq+12],q4); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - q3 = _mm256_FMA_pd(y3, h2, q3); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - q4 = _mm256_FMA_pd(y4, h2, q4); - _mm256_store_pd(&q[(i*ldq)+12],q4); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - q3 = _mm256_add_pd(q3, 
_mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); - _mm256_store_pd(&q[(i*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(i*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); - _mm256_store_pd(&q[(i*ldq)+12],q4); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_FMA_pd(x3, h1, q3); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_FMA_pd(x4, h1, q4); - _mm256_store_pd(&q[(nb*ldq)+12],q4); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - q4 = _mm256_load_pd(&q[(nb*ldq)+12]); - q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); - _mm256_store_pd(&q[(nb*ldq)+12],q4); -#endif -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - __m256d x2 = 
_mm256_load_pd(&q[ldq+4]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_FMA_pd(x2, h1, q2); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - __m256d q2 = _mm256_load_pd(&q[4]); - __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [8 x nb+1] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, 
_mm256_mul_pd(x1,h2)); - y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - q2 = _mm256_load_pd(&q[4]); - q2 = _mm256_add_pd(q2, y2); - _mm256_store_pd(&q[4],q2); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); - _mm256_store_pd(&q[ldq+4],q2); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); - q2 = _mm256_load_pd(&q[ldq+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[ldq+4],q2); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - q2 = _mm256_FMA_pd(y2, h2, q2); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_FMA_pd(x2, h1, q2); - 
_mm256_store_pd(&q[(nb*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); - _mm256_store_pd(&q[(nb*ldq)+4],q2); -#endif -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); - - __m256d x1 = _mm256_load_pd(&q[ldq]); - - __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h2; - -#ifdef __ELPA_USE_FMA__ - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_FMA_pd(x1, h1, q1); -#else - __m256d q1 = _mm256_load_pd(q); - __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - - for(i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [4 x nb+1] - 
///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(hh); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs = _mm256_broadcast_sd(&s); - - h1 = _mm256_xor_pd(tau1, sign); - x1 = _mm256_mul_pd(x1, h1); - h1 = _mm256_xor_pd(tau2, sign); - h2 = _mm256_mul_pd(h1, vs); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); -#endif - - q1 = _mm256_load_pd(q); - q1 = _mm256_add_pd(q1, y1); - _mm256_store_pd(q,q1); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); - _mm256_store_pd(&q[ldq],q1); -#else - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[ldq],q1); -#endif - - for (i = 2; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-1]); - h2 = _mm256_broadcast_sd(&hh[ldh+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - q1 = _mm256_FMA_pd(y1, h2, q1); - _mm256_store_pd(&q[i*ldq],q1); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); - _mm256_store_pd(&q[i*ldq],q1); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_FMA_pd(x1, h1, q1); - _mm256_store_pd(&q[nb*ldq],q1); -#else - q1 = _mm256_load_pd(&q[nb*ldq]); - q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); - _mm256_store_pd(&q[nb*ldq],q1); -#endif -} - diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c 1970-01-01 00:00:00.000000000 
+0000 @@ -1,1319 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - -#ifdef HAVE_AVX2 - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) -#endif - -#endif - -//Forward declaration -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, 
double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); - -/* -!f>#ifdef HAVE_AVX -!f> interface -!f> subroutine quad_hh_trafo_real_avx_avx2_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_avx_avx2_4hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 4 householder vectors simultaneously - double s_1_2 = hh[(ldh)+1]; - double s_1_3 = hh[(ldh*2)+2]; - double s_2_3 = hh[(ldh*2)+1]; - double s_1_4 = hh[(ldh*3)+3]; - double s_2_4 = hh[(ldh*3)+2]; - double s_3_4 = hh[(ldh*3)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - s_1_2 += hh[2-1] * hh[(2+ldh)]; - s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; - s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; - - // loop counter = 3 - s_1_2 += hh[3-1] * hh[(3+ldh)]; - s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; - s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; - - s_1_3 += hh[3-2] * hh[3+(ldh*2)]; - s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; - - #pragma ivdep - for (i = 4; i < nb; i++) - { - s_1_2 += hh[i-1] * hh[(i+ldh)]; - s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - - s_1_3 += hh[i-2] * hh[i+(ldh*2)]; - s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - - s_1_4 += hh[i-3] * hh[i+(ldh*3)]; - } - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - if (nq == i) - { - return; - } - else - { 
- if (nq-i > 4) - { - hh_trafo_kernel_8_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - else - { - hh_trafo_kernel_4_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - } -#else - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - if (nq == i) - { - return; - } - else - { - if (nq-i > 2) - { - hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - else - { - hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - } -#endif -} - -/** - * Unrolled kernel that computes - * 12 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [12 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - 
register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); - register __m256d x1 = a1_1; -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); - register __m256d x1 = a1_1; -#endif - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); - __m256d a4_2 = _mm256_load_pd(&q[0+4]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); - register __m256d x2 = a1_2; -#else - register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); - register __m256d x2 = a1_2; -#endif - - __m256d a1_3 = _mm256_load_pd(&q[(ldq*3)+8]); - __m256d a2_3 = _mm256_load_pd(&q[(ldq*2)+8]); - __m256d a3_3 = _mm256_load_pd(&q[ldq+8]); - __m256d a4_3 = _mm256_load_pd(&q[0+8]); - -#ifdef __ELPA_USE_FMA__ - register __m256d w3 = _mm256_FMA_pd(a3_3, h_4_3, a4_3); - w3 = _mm256_FMA_pd(a2_3, h_4_2, w3); - w3 = _mm256_FMA_pd(a1_3, h_4_1, w3); - register __m256d z3 = _mm256_FMA_pd(a2_3, h_3_2, a3_3); - z3 = _mm256_FMA_pd(a1_3, h_3_1, z3); - register __m256d y3 = _mm256_FMA_pd(a1_3, h_2_1, a2_3); - register __m256d x3 = 
a1_3; -#else - register __m256d w3 = _mm256_add_pd(a4_3, _mm256_mul_pd(a3_3, h_4_3)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(a2_3, h_4_2)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(a1_3, h_4_1)); - register __m256d z3 = _mm256_add_pd(a3_3, _mm256_mul_pd(a2_3, h_3_2)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(a1_3, h_3_1)); - register __m256d y3 = _mm256_add_pd(a2_3, _mm256_mul_pd(a1_3, h_2_1)); - register __m256d x3 = a1_3; -#endif - - __m256d q1; - __m256d q2; - __m256d q3; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); - z3 = _mm256_FMA_pd(q3, h3, z3); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); -#endif - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); - w3 = _mm256_FMA_pd(q3, h4, w3); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); - w3 = _mm256_add_pd(w3, _mm256_mul_pd(q3,h4)); -#endif - } 
- - h1 = _mm256_broadcast_sd(&hh[nb-3]); - - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __FMA4_ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); - z3 = _mm256_FMA_pd(q3, h3, z3); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - y3 = _mm256_FMA_pd(q3, h2, y3); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = 
_mm256_load_pd(&q[((nb+2)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - x3 = _mm256_FMA_pd(q3, h1, x3); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [12 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - x3 = _mm256_mul_pd(x3, h1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); - y3 = _mm256_FMSUB_pd(y3, h1, _mm256_mul_pd(x3,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); - y3 = _mm256_sub_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); -#endif - - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); - z3 = _mm256_FMSUB_pd(z3, h1, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); - z3 = _mm256_sub_pd(_mm256_mul_pd(z3,h1), _mm256_add_pd(_mm256_mul_pd(y3,h3), 
_mm256_mul_pd(x3,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); - w3 = _mm256_FMSUB_pd(w3, h1, _mm256_FMA_pd(z3, h4, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); - w3 = _mm256_sub_pd(_mm256_mul_pd(w3,h1), _mm256_add_pd(_mm256_mul_pd(z3,h4), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q3 = _mm256_load_pd(&q[8]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); - q3 = _mm256_sub_pd(q3, w3); - _mm256_store_pd(&q[0],q1); - _mm256_store_pd(&q[4],q2); - _mm256_store_pd(&q[8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[ldq+4]); - q3 = _mm256_load_pd(&q[ldq+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); - q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); - q3 = _mm256_sub_pd(q3, _mm256_FMA_pd(w3, h4, z3)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); - q3 = _mm256_sub_pd(q3, _mm256_add_pd(z3, _mm256_mul_pd(w3, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[ldq+4],q2); - 
_mm256_store_pd(&q[ldq+8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); - q3 = _mm256_load_pd(&q[(ldq*2)+8]); - q1 = _mm256_sub_pd(q1, y1); - q2 = _mm256_sub_pd(q2, y2); - q3 = _mm256_sub_pd(q3, y3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = _mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - _mm256_store_pd(&q[(ldq*2)+8],q3); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); - q3 = _mm256_load_pd(&q[(ldq*3)+8]); - q1 = _mm256_sub_pd(q1, x1); - q2 = _mm256_sub_pd(q2, x2); - q3 = _mm256_sub_pd(q3, x3); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = _mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = 
_mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); -#endif - _mm256_store_pd(&q[ldq*3], q1); - _mm256_store_pd(&q[(ldq*3)+4], q2); - _mm256_store_pd(&q[(ldq*3)+8], q3); - - for (i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q3 = _mm256_load_pd(&q[(i*ldq)+8]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); -#endif - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); - q3 = _mm256_NFMA_pd(w3, h4, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1,h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2,h4)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3,h4)); -#endif - - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); - _mm256_store_pd(&q[(i*ldq)+8],q3); - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = 
_mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - q3 = _mm256_load_pd(&q[(nb*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q3 = _mm256_NFMA_pd(z3, h3, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - _mm256_store_pd(&q[(nb*ldq)+8],q3); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q3 = _mm256_NFMA_pd(y3, h2, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - 
_mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - _mm256_store_pd(&q[((nb+1)*ldq)+8],q3); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); - q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q3 = _mm256_NFMA_pd(x3, h1, q3); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); - q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); - _mm256_store_pd(&q[((nb+2)*ldq)+8],q3); -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - __m256d y1 = 
_mm256_FMA_pd(a1_1, h_2_1, a2_1); - __m256d x1 = a1_1; -#else - __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); - __m256d x1 = a1_1; -#endif - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); - __m256d a4_2 = _mm256_load_pd(&q[0+4]); - -#ifdef __ELPA_USE_FMA__ - __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); - __m256d x2 = a1_2; -#else - __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); - __m256d x2 = a1_2; -#endif - - __m256d q1; - __m256d q2; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - w1 = _mm256_add_pd(w1, 
_mm256_mul_pd(q1,h4)); -#endif - - q2 = _mm256_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x2 = _mm256_FMA_pd(q2, h1, x2); - y2 = _mm256_FMA_pd(q2, h2, y2); - z2 = _mm256_FMA_pd(q2, h3, z2); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); - -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, 
_mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [8 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - x2 = _mm256_mul_pd(x2, h1); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); -#endif - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); -#endif - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), 
_mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); - _mm256_store_pd(&q[0],q1); - _mm256_store_pd(&q[4],q2); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[ldq+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); - q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[ldq+4],q2); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, y1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_sub_pd(q2, y2); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(y2, _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4)))); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, x1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_sub_pd(q2, x2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 
= _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(x2, _mm256_add_pd(_mm256_mul_pd(y2, h2), _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4))))); -#endif - _mm256_store_pd(&q[ldq*3], q1); - _mm256_store_pd(&q[(ldq*3)+4], q2); - - for (i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 = _mm256_NFMA_pd(z2, h3, q2); - q2 = _mm256_NFMA_pd(w2, h4, q2); - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#else - q1 = _mm256_load_pd(&q[i*ldq]); - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); - _mm256_store_pd(&q[i*ldq],q1); - - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2, h4), _mm256_mul_pd(z2, h3)), _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)))); - _mm256_store_pd(&q[(i*ldq)+4],q2); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); - q2 = 
_mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(y2, h2)) , _mm256_mul_pd(x2, h1))); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); - q2 = _mm256_sub_pd(q2, _mm256_add_pd( _mm256_mul_pd(y2, h2) , _mm256_mul_pd(x2, h1))); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); 
- __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a3_1 = _mm256_load_pd(&q[ldq]); - __m256d a4_1 = _mm256_load_pd(&q[0]); - - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - -#ifdef __ELPA_USE_FMA__ - __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); - __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); - __m256d x1 = a1_1; -#else - __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); - __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); - __m256d x1 = a1_1; -#endif - - __m256d q1; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = 
_mm256_load_pd(&q[nb*ldq]); -#ifdef _FMA4__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - - __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); - __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); - __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); - __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); - __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); - - h1 = tau1; - x1 = _mm256_mul_pd(x1, h1); - - h1 = tau2; - h2 = _mm256_mul_pd(h1, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); -#endif - - h1 = tau3; - h2 = _mm256_mul_pd(h1, vs_1_3); - h3 = _mm256_mul_pd(h1, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), 
_mm256_mul_pd(x1,h2))); -#endif - - h1 = tau4; - h2 = _mm256_mul_pd(h1, vs_1_4); - h3 = _mm256_mul_pd(h1, vs_2_4); - h4 = _mm256_mul_pd(h1, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - q1 = _mm256_load_pd(&q[0]); - q1 = _mm256_sub_pd(q1, w1); - _mm256_store_pd(&q[0],q1); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); -#endif - _mm256_store_pd(&q[ldq],q1); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - q1 = _mm256_load_pd(&q[ldq*2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, y1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); -#endif - _mm256_store_pd(&q[ldq*2],q1); - - h2 = _mm256_broadcast_sd(&hh[ldh+1]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); - q1 = _mm256_load_pd(&q[ldq*3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_sub_pd(q1, x1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); -#endif - _mm256_store_pd(&q[ldq*3], q1); - - for (i = 4; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); - - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, 
h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); -#endif - _mm256_store_pd(&q[i*ldq],q1); - } - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); -} - diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1770 +0,0 @@ -// This file is part of ELPA. 
-// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - -#ifdef HAVE_AVX2 - -#ifdef __FMA4__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) -#endif - -#ifdef __AVX2__ -#define __ELPA_USE_FMA__ -#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) -#define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) -#define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) -#endif - -#endif - -//Forward declaration -static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); - -/* -!f>#ifdef HAVE_AVX -!f> interface -!f> subroutine hexa_hh_trafo_real_avx_avx2_6hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, 
pldh -!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = hh[(ldh*4)+1]; - scalarprods[10] = hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - 
scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += 
hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: %f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq-4; i+=8) - { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } - if (nq == i) - { - return; - } - else - { - hh_trafo_kernel_4_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq-2; i+=4) - { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } - if (nq == i) - { - return; - } - else - { - hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#endif -} - -#if 0 -void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = 
s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = hh[(ldh*4)+1]; - scalarprods[10] = hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += 
hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: 
%f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=8) - { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq; i+=4) - { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#endif -} -#endif - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); - __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a5_1 = _mm256_load_pd(&q[ldq]); - __m256d a6_1 = _mm256_load_pd(&q[0]); - - __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); - __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); - __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); - __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); - t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); - t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); 
- t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); -#else - register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); -#endif - __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); - __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); - __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); - v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); - v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); -#else - register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); -#endif - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); -#endif - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); -#else - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, 
_mm256_mul_pd(a1_1, h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); -#endif - register __m256d x1 = a1_1; - - - __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]); - __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]); - __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]); - __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]); - __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]); - __m256d a6_2 = _mm256_load_pd(&q[4]); - -#ifdef __ELPA_USE_FMA__ - register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2); - t2 = _mm256_FMA_pd(a4_2, h_6_4, t2); - t2 = _mm256_FMA_pd(a3_2, h_6_3, t2); - t2 = _mm256_FMA_pd(a2_2, h_6_2, t2); - t2 = _mm256_FMA_pd(a1_2, h_6_1, t2); - register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2); - v2 = _mm256_FMA_pd(a3_2, h_5_3, v2); - v2 = _mm256_FMA_pd(a2_2, h_5_2, v2); - v2 = _mm256_FMA_pd(a1_2, h_5_1, v2); - register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); - w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); - w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); - register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); - z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); - register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); -#else - register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1)); - register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1)); - register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); - register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); - register __m256d y2 = 
_mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); -#endif - register __m256d x2 = a1_2; - - __m256d q1; - __m256d q2; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - __m256d h5; - __m256d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-5]); - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); - v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_pd(q1, h6, t1); - t2 = _mm256_FMA_pd(q2, h6, t2); -#else - t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); - t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, 
x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); - v2 = _mm256_FMA_pd(q2, h5, v2); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); - v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef 
__ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); - w2 = _mm256_FMA_pd(q2, h4, w2); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); - w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); - z2 = _mm256_FMA_pd(q2, h3, z2); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); - z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); - q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); - y2 = _mm256_FMA_pd(q2, h2, y2); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); - y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); - q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); - x2 = _mm256_FMA_pd(q2, h1, x2); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); - x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, 
correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - x1 = _mm256_mul_pd(x1, tau1); - x2 = _mm256_mul_pd(x2, tau1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); - h2 = _mm256_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); - y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); - y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2)); -#endif - - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); - __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); - h2 = _mm256_mul_pd(tau3, vs_1_3); - h3 = _mm256_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); - z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); - z2 = _mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); - __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); - h2 = _mm256_mul_pd(tau4, vs_1_4); - h3 = _mm256_mul_pd(tau4, vs_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); - h4 = _mm256_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), 
_mm256_mul_pd(x1,h2)))); - w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); - __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); - __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); - h2 = _mm256_mul_pd(tau5, vs_1_5); - h3 = _mm256_mul_pd(tau5, vs_2_5); - __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); - __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); - h4 = _mm256_mul_pd(tau5, vs_3_5); - h5 = _mm256_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); - v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); -#else - v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); - v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); -#endif - - __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); - __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); - __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); - h2 = _mm256_mul_pd(tau6, vs_1_6); - h3 = _mm256_mul_pd(tau6, vs_2_6); - __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); - __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); - __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); - h4 = _mm256_mul_pd(tau6, vs_3_6); - h5 = _mm256_mul_pd(tau6, vs_4_6); - h6 = _mm256_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); - t2 = 
_mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))))); -#else - t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); - t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))))); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [8 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm256_load_pd(&q[0]); - q2 = _mm256_load_pd(&q[4]); - q1 = _mm256_sub_pd(q1, t1); - q2 = _mm256_sub_pd(q2, t2); - _mm256_store_pd(&q[0],q1); - _mm256_store_pd(&q[4],q2); - - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q2 = _mm256_load_pd(&q[(ldq+4)]); - q1 = _mm256_sub_pd(q1, v1); - q2 = _mm256_sub_pd(q2, v2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq],q1); - _mm256_store_pd(&q[(ldq+4)],q2); - - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - q1 = _mm256_load_pd(&q[ldq*2]); - q2 = _mm256_load_pd(&q[(ldq*2)+4]); - q1 = _mm256_sub_pd(q1, w1); - q2 = _mm256_sub_pd(q2, w2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, 
_mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - _mm256_store_pd(&q[(ldq*2)+4],q2); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq*3]); - q2 = _mm256_load_pd(&q[(ldq*3)+4]); - q1 = _mm256_sub_pd(q1, z1); - q2 = _mm256_sub_pd(q2, z2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*3],q1); - _mm256_store_pd(&q[(ldq*3)+4],q2); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - q1 = _mm256_load_pd(&q[ldq*4]); - q2 = _mm256_load_pd(&q[(ldq*4)+4]); - q1 = _mm256_sub_pd(q1, y1); - q2 = _mm256_sub_pd(q2, y2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); 
-#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*4],q1); - _mm256_store_pd(&q[(ldq*4)+4],q2); - - h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); - q1 = _mm256_load_pd(&q[ldq*5]); - q2 = _mm256_load_pd(&q[(ldq*5)+4]); - q1 = _mm256_sub_pd(q1, x1); - q2 = _mm256_sub_pd(q2, x2); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[ldq*5],q1); - _mm256_store_pd(&q[(ldq*5)+4],q2); - - for (i = 6; i < nb; i++) - { - q1 = _mm256_load_pd(&q[i*ldq]); - q2 = _mm256_load_pd(&q[(i*ldq)+4]); - h1 = _mm256_broadcast_sd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); - q2 = _mm256_NFMA_pd(t2, h6, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); -#endif - _mm256_store_pd(&q[i*ldq],q1); - _mm256_store_pd(&q[(i*ldq)+4],q2); - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); - q2 = _mm256_load_pd(&q[(nb*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, 
h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); - q2 = _mm256_NFMA_pd(v2, h5, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - _mm256_store_pd(&q[(nb*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); - q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); - q2 = _mm256_NFMA_pd(w2, h4, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - 
_mm256_store_pd(&q[((nb+1)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); - q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); - q2 = _mm256_NFMA_pd(z2, h3, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); - q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); - q2 = _mm256_NFMA_pd(y2, h2, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); -#endif - _mm256_store_pd(&q[(nb+3)*ldq],q1); - _mm256_store_pd(&q[((nb+3)*ldq)+4],q2); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); - q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); - q2 = _mm256_NFMA_pd(x2, h1, q2); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); - q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); -#endif - _mm256_store_pd(&q[(nb+4)*ldq],q1); - 
_mm256_store_pd(&q[((nb+4)*ldq)+4],q2); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); - __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); - __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); - __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); - __m256d a5_1 = _mm256_load_pd(&q[ldq]); - __m256d a6_1 = _mm256_load_pd(&q[0]); - - __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); - __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); - __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); - __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); - t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); - t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); - t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); - t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); -#else - register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); - t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); -#endif - __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); - __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); - __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); - v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); - v1 = _mm256_FMA_pd(a2_1, 
h_5_2, v1); - v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); -#else - register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); - v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); -#endif - __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); - __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); - w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); - w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); -#else - register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); - w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); -#endif - __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); - __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); - z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); - register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); -#else - register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); - z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); - register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); -#endif - register __m256d x1 = a1_1; - - __m256d q1; - - __m256d h1; - __m256d h2; - __m256d h3; - __m256d h4; - __m256d h5; - __m256d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm256_broadcast_sd(&hh[i-5]); - q1 = _mm256_load_pd(&q[i*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, 
h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMA_pd(q1, h6, t1); -#else - t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); -#endif - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMA_pd(q1, h5, v1); -#else - v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - h4 = 
_mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMA_pd(q1, h4, w1); -#else - w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMA_pd(q1, h3, z1); -#else - z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMA_pd(q1, h2, y1); -#else - y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); -#endif - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - x1 = _mm256_FMA_pd(q1, h1, x1); -#else - x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); -#endif - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m256d tau1 = _mm256_broadcast_sd(&hh[0]); - x1 = _mm256_mul_pd(x1, tau1); - - __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); - __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); - h2 = _mm256_mul_pd(tau2, vs_1_2); -#ifdef __ELPA_USE_FMA__ - y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); -#else - y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); -#endif - - __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); - __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); - __m256d vs_2_3 = 
_mm256_broadcast_sd(&scalarprods[2]); - h2 = _mm256_mul_pd(tau3, vs_1_3); - h3 = _mm256_mul_pd(tau3, vs_2_3); -#ifdef __ELPA_USE_FMA__ - z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); -#else - z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); -#endif - - __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); - __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); - __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); - h2 = _mm256_mul_pd(tau4, vs_1_4); - h3 = _mm256_mul_pd(tau4, vs_2_4); - __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); - h4 = _mm256_mul_pd(tau4, vs_3_4); -#ifdef __ELPA_USE_FMA__ - w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); - __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); - __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); - h2 = _mm256_mul_pd(tau5, vs_1_5); - h3 = _mm256_mul_pd(tau5, vs_2_5); - __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); - __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); - h4 = _mm256_mul_pd(tau5, vs_3_5); - h5 = _mm256_mul_pd(tau5, vs_4_5); -#ifdef __ELPA_USE_FMA__ - v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); -#else - v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); -#endif - - __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); - __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); - __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); - h2 = _mm256_mul_pd(tau6, vs_1_6); - h3 = _mm256_mul_pd(tau6, vs_2_6); - __m256d vs_3_6 = 
_mm256_broadcast_sd(&scalarprods[12]); - __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); - __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); - h4 = _mm256_mul_pd(tau6, vs_3_6); - h5 = _mm256_mul_pd(tau6, vs_4_6); - h6 = _mm256_mul_pd(tau6, vs_5_6); -#ifdef __ELPA_USE_FMA__ - t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); -#else - t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); -#endif - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm256_load_pd(&q[0]); - q1 = _mm256_sub_pd(q1, t1); - _mm256_store_pd(&q[0],q1); - - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); - q1 = _mm256_load_pd(&q[ldq]); - q1 = _mm256_sub_pd(q1, v1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq],q1); - - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); - q1 = _mm256_load_pd(&q[ldq*2]); - q1 = _mm256_sub_pd(q1, w1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*2],q1); - - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); - q1 = _mm256_load_pd(&q[ldq*3]); - q1 = _mm256_sub_pd(q1, z1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, 
_mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*3],q1); - - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); - q1 = _mm256_load_pd(&q[ldq*4]); - q1 = _mm256_sub_pd(q1, y1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*4],q1); - - h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); - q1 = _mm256_load_pd(&q[ldq*5]); - q1 = _mm256_sub_pd(q1, x1); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[ldq*5],q1); - - for (i = 6; i < nb; i++) - { - q1 = 
_mm256_load_pd(&q[i*ldq]); - h1 = _mm256_broadcast_sd(&hh[i-5]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(t1, h6, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); -#endif - _mm256_store_pd(&q[i*ldq],q1); - } - - h1 = _mm256_broadcast_sd(&hh[nb-5]); - q1 = _mm256_load_pd(&q[nb*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(v1, h5, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); -#endif - _mm256_store_pd(&q[nb*ldq],q1); - - h1 = 
_mm256_broadcast_sd(&hh[nb-4]); - q1 = _mm256_load_pd(&q[(nb+1)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(w1, h4, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); -#endif - _mm256_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-3]); - q1 = _mm256_load_pd(&q[(nb+2)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(z1, h3, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); -#endif - _mm256_store_pd(&q[(nb+2)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-2]); - q1 = _mm256_load_pd(&q[(nb+3)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(y1, h2, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); -#endif - _mm256_store_pd(&q[(nb+3)*ldq],q1); - - h1 = _mm256_broadcast_sd(&hh[nb-1]); - q1 = _mm256_load_pd(&q[(nb+4)*ldq]); -#ifdef __ELPA_USE_FMA__ - q1 = _mm256_NFMA_pd(x1, h1, q1); -#else - q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); -#endif - _mm256_store_pd(&q[(nb+4)*ldq],q1); -} - diff -Nru 
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 1970-01-01 00:00:00.000000000 +0000 @@ -1,799 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! 
any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran *** -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- -!module real_bgp_kernel - -! private -! public double_hh_trafo_bgp -!contains - subroutine double_hh_trafo_bgp(q, hh, nb, nq, ldq, ldh) - use precision - - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) - - real(kind=rk) :: s - integer(kind=ik) :: i - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - if(mod(loc(q),16) /= 0) STOP 'Q unaligned!' - - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - do i=1,nq-16,20 - call hh_trafo_kernel_10_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_10_bgp(q(i+10,1), hh, nb, ldq, ldh, s) - enddo - - ! i > nq-16 now, i.e. 
at most 16 rows remain - - if(nq-i+1 > 12) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_8_bgp(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 8) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgp(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 4) then - call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 0) then - call hh_trafo_kernel_4_bgp(q(i ,1), hh, nb, ldq, ldh, s) - endif - - end subroutine double_hh_trafo_bgp - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 10/8/4 rows. - ! Please note that Q is declared complex*16 here. - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s) - - use precision - use elpa_mpi - implicit none - - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, x3, x4, x5, y1, y2, y3, y4, y5, q1, q2, q3, q4, q5, p1, p2, p3, p4, p5 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! fpadd(a,b) = a+b - ! 
- call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - x3 = loadfp(q(3,2)) - x4 = loadfp(q(4,2)) - x5 = loadfp(q(5,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y3 = loadfp(q(3,1)) - y4 = loadfp(q(4,1)) - y5 = loadfp(q(5,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - y3 = fxcpmadd(y3,x3,h2) - q3 = loadfp(q(3,3)) - y4 = fxcpmadd(y4,x4,h2) - q4 = loadfp(q(4,3)) - y5 = fxcpmadd(y5,x5,h2) - q5 = loadfp(q(5,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+1)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+1)) - y5 = fxcpmadd(y5,q5,h2) - q5 = loadfp(q(5,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+2)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+2)) - y5 = fxcpmadd(y5,q5,h2) - q5 = loadfp(q(5,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - x5 = fxcpmadd(x5,q5,h1) - - h1 = -hh(1,1) ! 
for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - x3 = fxpmul(x3,h1) - x4 = fxpmul(x4,h1) - x5 = fxpmul(x5,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y3 = fxpmul(y3,h2) - y4 = fxpmul(y4,h2) - y5 = fxpmul(y5,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - y3 = fxcpmadd(y3,x3,h1) - q3 = loadfp(q(3,1)) - y4 = fxcpmadd(y4,x4,h1) - q4 = loadfp(q(4,1)) - y5 = fxcpmadd(y5,x5,h1) - q5 = loadfp(q(5,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - q3 = fpadd(q3,y3) - p3 = loadfp(q(3,2)) - q4 = fpadd(q4,y4) - p4 = loadfp(q(4,2)) - q5 = fpadd(q5,y5) - p5 = loadfp(q(5,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - call storefp(q(3,1),q3) - p3 = fpadd(p3,x3) - call storefp(q(4,1),q4) - p4 = fpadd(p4,x4) - call storefp(q(5,1),q5) - p5 = fpadd(p5,x5) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,3)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,3)) - p5 = fxcpmadd(p5,y5,h2) - q5 = loadfp(q(5,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,i-1),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,i-1),p4) - q4 = fxcpmadd(q4,x4,h1) - call storefp(q(5,i-1),p5) - q5 = fxcpmadd(q5,x5,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - q3 = fxcpmadd(q3,y3,h2) - p3 = loadfp(q(3,i+1)) - q4 = fxcpmadd(q4,y4,h2) - p4 = loadfp(q(4,i+1)) - q5 = fxcpmadd(q5,y5,h2) - p5 = loadfp(q(5,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,i),q3) - p3 = fxcpmadd(p3,x3,h1) - call 
storefp(q(4,i),q4) - p4 = fxcpmadd(p4,x4,h1) - call storefp(q(5,i),q5) - p5 = fxcpmadd(p5,x5,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,i+2)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,i+2)) - p5 = fxcpmadd(p5,y5,h2) - q5 = loadfp(q(5,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,nb),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,nb),q4) - p4 = fxcpmadd(p4,x4,h1) - call storefp(q(5,nb),q5) - p5 = fxcpmadd(p5,x5,h1) - - call storefp(q(1,nb+1),p1) - call storefp(q(2,nb+1),p2) - call storefp(q(3,nb+1),p3) - call storefp(q(4,nb+1),p4) - call storefp(q(5,nb+1),p5) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,nb),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,nb),p4) - q4 = fxcpmadd(q4,x4,h1) - call storefp(q(5,nb),p5) - q5 = fxcpmadd(q5,x5,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - call storefp(q(3,nb+1),q3) - call storefp(q(4,nb+1),q4) - call storefp(q(5,nb+1),q5) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_10_bgp - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s) - - use precision - use elpa_mpi - implicit none - - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4, q1, q2, q3, q4, p1, p2, p3, p4 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! 
complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! fpadd(a,b) = a+b - - call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - x3 = loadfp(q(3,2)) - x4 = loadfp(q(4,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y3 = loadfp(q(3,1)) - y4 = loadfp(q(4,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - y3 = fxcpmadd(y3,x3,h2) - q3 = loadfp(q(3,3)) - y4 = fxcpmadd(y4,x4,h2) - q4 = loadfp(q(4,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+1)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - y3 = fxcpmadd(y3,q3,h2) - q3 = loadfp(q(3,i+2)) - y4 = fxcpmadd(y4,q4,h2) - q4 = loadfp(q(4,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - x3 = fxcpmadd(x3,q3,h1) - x4 = fxcpmadd(x4,q4,h1) - - h1 = -hh(1,1) ! 
for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - x3 = fxpmul(x3,h1) - x4 = fxpmul(x4,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y3 = fxpmul(y3,h2) - y4 = fxpmul(y4,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - y3 = fxcpmadd(y3,x3,h1) - q3 = loadfp(q(3,1)) - y4 = fxcpmadd(y4,x4,h1) - q4 = loadfp(q(4,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - q3 = fpadd(q3,y3) - p3 = loadfp(q(3,2)) - q4 = fpadd(q4,y4) - p4 = loadfp(q(4,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - call storefp(q(3,1),q3) - p3 = fpadd(p3,x3) - call storefp(q(4,1),q4) - p4 = fpadd(p4,x4) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,3)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,i-1),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,i-1),p4) - q4 = fxcpmadd(q4,x4,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - q3 = fxcpmadd(q3,y3,h2) - p3 = loadfp(q(3,i+1)) - q4 = fxcpmadd(q4,y4,h2) - p4 = loadfp(q(4,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,i),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,i),q4) - p4 = fxcpmadd(p4,x4,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - p3 = fxcpmadd(p3,y3,h2) - q3 = loadfp(q(3,i+2)) - p4 = fxcpmadd(p4,y4,h2) - q4 = loadfp(q(4,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = 
fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - call storefp(q(3,nb),q3) - p3 = fxcpmadd(p3,x3,h1) - call storefp(q(4,nb),q4) - p4 = fxcpmadd(p4,x4,h1) - - call storefp(q(1,nb+1),p1) - call storefp(q(2,nb+1),p2) - call storefp(q(3,nb+1),p3) - call storefp(q(4,nb+1),p4) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - call storefp(q(3,nb),p3) - q3 = fxcpmadd(q3,x3,h1) - call storefp(q(4,nb),p4) - q4 = fxcpmadd(q4,x4,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - call storefp(q(3,nb+1),q3) - call storefp(q(4,nb+1),q4) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_8_bgp - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s) - - use precision - use elpa_mpi - implicit none - - - integer(kind=ik), intent(in) :: nb, ldq, ldh - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - complex(kind=ck) :: x1, x2, y1, y2, q1, q2, p1, p2 - real(kind=rk) :: h1, h2 - integer(kind=ik) :: i - - ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b - ! real*8 x - ! loadfp(a) = a - ! fxcpmadd(a,b,x) = a + b*x - ! fxpmul(a,x) = a*x - ! 
fpadd(a,b) = a+b - - call alignx(16,q) - - - x1 = loadfp(q(1,2)) - x2 = loadfp(q(2,2)) - - h2 = hh(2,2) - y1 = loadfp(q(1,1)) - y2 = loadfp(q(2,1)) - y1 = fxcpmadd(y1,x1,h2) - q1 = loadfp(q(1,3)) - y2 = fxcpmadd(y2,x2,h2) - q2 = loadfp(q(2,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = hh(i ,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+1)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = hh(i+1,1) - - y1 = fxcpmadd(y1,q1,h2) - q1 = loadfp(q(1,i+2)) - y2 = fxcpmadd(y2,q2,h2) - q2 = loadfp(q(2,i+2)) - - enddo - - x1 = fxcpmadd(x1,q1,h1) - x2 = fxcpmadd(x2,q2,h1) - - h1 = -hh(1,1) ! for below - h2 = -hh(1,2) - x1 = fxpmul(x1,h1) - x2 = fxpmul(x2,h1) - h1 = -hh(1,2)*s - y1 = fxpmul(y1,h2) - y2 = fxpmul(y2,h2) - y1 = fxcpmadd(y1,x1,h1) - q1 = loadfp(q(1,1)) - y2 = fxcpmadd(y2,x2,h1) - q2 = loadfp(q(2,1)) - - q1 = fpadd(q1,y1) - p1 = loadfp(q(1,2)) - q2 = fpadd(q2,y2) - p2 = loadfp(q(2,2)) - - h2 = hh(2,2) - - call storefp(q(1,1),q1) - p1 = fpadd(p1,x1) - call storefp(q(2,1),q2) - p2 = fpadd(p2,x2) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,3)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,3)) - - h1 = hh(3-1,1) - - do i=3,nb,2 - - h2 = hh(i,2) - - call storefp(q(1,i-1),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,i-1),p2) - q2 = fxcpmadd(q2,x2,h1) - - h1 = hh(i,1) - - q1 = fxcpmadd(q1,y1,h2) - p1 = loadfp(q(1,i+1)) - q2 = fxcpmadd(q2,y2,h2) - p2 = loadfp(q(2,i+1)) - - if(i==nb) exit - - h2 = hh(i+1,2) - - call storefp(q(1,i),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,i),q2) - p2 = fxcpmadd(p2,x2,h1) - - h1 = hh(i+1,1) - - p1 = fxcpmadd(p1,y1,h2) - q1 = loadfp(q(1,i+2)) - p2 = fxcpmadd(p2,y2,h2) - q2 = loadfp(q(2,i+2)) - - enddo - - - if(i==nb) then - call storefp(q(1,nb),q1) - p1 = fxcpmadd(p1,x1,h1) - call storefp(q(2,nb),q2) - p2 = fxcpmadd(p2,x2,h1) - - call storefp(q(1,nb+1),p1) 
- call storefp(q(2,nb+1),p2) - else - call storefp(q(1,nb),p1) - q1 = fxcpmadd(q1,x1,h1) - call storefp(q(2,nb),p2) - q2 = fxcpmadd(q2,x2,h1) - - call storefp(q(1,nb+1),q1) - call storefp(q(2,nb+1),q2) - endif - - - !contains - ! - ! subroutine storefp(a,b) - ! complex*16 a, b - ! - ! a = b - ! end subroutine - ! subroutine alignx(n, x) - ! integer n - ! complex*16 x(ldq/2,*) - ! end subroutine - - end subroutine hh_trafo_kernel_4_bgp -!end module real_bgp_kernel -! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 1970-01-01 00:00:00.000000000 +0000 @@ -1,662 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! 
but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! *** Special IBM BlueGene/Q version with QPX intrinsics in Fortran *** -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! -------------------------------------------------------------------------------------------------- -module real_bgq_kernel - - private - public double_hh_trafo_bgq -contains - subroutine double_hh_trafo_bgq(q, hh, nb, nq, ldq, ldh) - use precision - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) - - real(kind=rk) :: s - integer(kind=ik) :: i - - ! Safety only: - - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - call alignx(32,q) - - ! 
Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - do i=1,nq-20,24 - call hh_trafo_kernel_24_bgq(q(i ,1), hh, nb, ldq, ldh, s) - enddo - - if(nq-i+1 > 16) then - call hh_trafo_kernel_16_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgq(q(i+16,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 12) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_8_bgq(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 8) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - call hh_trafo_kernel_4_bgq(q(i+8,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 4) then - call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s) - else if(nq-i+1 > 0) then - call hh_trafo_kernel_4_bgq(q(i ,1), hh, nb, ldq, ldh, s) - endif - - end subroutine double_hh_trafo_bgq - - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 24/16/8/4 rows. - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_24_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4, QPX_x5, QPX_x6 - VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4, QPX_y5, QPX_y6 - VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4, QPX_q5, QPX_q6 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - integer i - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - QPX_x3 = VEC_LD(0,q(9,2)) - QPX_x4 = VEC_LD(0,q(13,2)) - QPX_x5 = VEC_LD(0,q(17,2)) - QPX_x6 = VEC_LD(0,q(21,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q5 = VEC_LD(0,q(17,1)) - QPX_q6 = VEC_LD(0,q(21,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) - QPX_y5 = VEC_MADD(QPX_x5, QPX_h2, QPX_q5) - QPX_y6 = VEC_MADD(QPX_x6, QPX_h2, QPX_q6) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_q5 = VEC_LD(0,q(17,i)) - QPX_q6 = VEC_LD(0,q(21,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) - QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) - 
QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4) - QPX_y5 = VEC_MADD(QPX_q5, QPX_h2, QPX_y5) - QPX_y6 = VEC_MADD(QPX_q6, QPX_h2, QPX_y6) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q5 = VEC_LD(0,q(17,nb+1)) - QPX_q6 = VEC_LD(0,q(21,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5) - QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) - QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) - QPX_x5 = VEC_MUL(QPX_x5, QPX_tau1) - QPX_x6 = VEC_MUL(QPX_x6, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) - QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) - QPX_y5 = VEC_MUL(QPX_y5, QPX_tau2) - QPX_y6 = VEC_MUL(QPX_y6, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) - QPX_y5 = VEC_MADD(QPX_x5, QPX_s, QPX_y5) - QPX_y6 = VEC_MADD(QPX_x6, QPX_s, QPX_y6) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q5 = VEC_LD(0,q(17,1)) - QPX_q6 = VEC_LD(0,q(21,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) - QPX_q5 = VEC_ADD(QPX_q5, QPX_y5) - QPX_q6 = VEC_ADD(QPX_q6, QPX_y6) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - call VEC_ST(QPX_q3, 0, q(9,1)) - call VEC_ST(QPX_q4, 
0, q(13,1)) - call VEC_ST(QPX_q5, 0, q(17,1)) - call VEC_ST(QPX_q6, 0, q(21,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q3 = VEC_LD(0,q(9,2)) - QPX_q4 = VEC_LD(0,q(13,2)) - QPX_q5 = VEC_LD(0,q(17,2)) - QPX_q6 = VEC_LD(0,q(21,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) - QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) - QPX_q5 = VEC_ADD(QPX_q5, QPX_x5) - QPX_q6 = VEC_ADD(QPX_q6, QPX_x6) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - call VEC_ST(QPX_q3, 0, q(9,2)) - call VEC_ST(QPX_q4, 0, q(13,2)) - call VEC_ST(QPX_q5, 0, q(17,2)) - call VEC_ST(QPX_q6, 0, q(21,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_q5 = VEC_LD(0,q(17,i)) - QPX_q6 = VEC_LD(0,q(21,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) - QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5) - QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - call VEC_ST(QPX_q3, 0, q(9,i)) - call VEC_ST(QPX_q4, 0, q(13,i)) - call VEC_ST(QPX_q5, 0, q(17,i)) - call VEC_ST(QPX_q6, 0, q(21,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = 
VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q5 = VEC_LD(0,q(17,nb+1)) - QPX_q6 = VEC_LD(0,q(21,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5) - QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - call VEC_ST(QPX_q3, 0, q(9,nb+1)) - call VEC_ST(QPX_q4, 0, q(13,nb+1)) - call VEC_ST(QPX_q5, 0, q(17,nb+1)) - call VEC_ST(QPX_q6, 0, q(21,nb+1)) - - end subroutine hh_trafo_kernel_24_bgq - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_16_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4 - VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4 - VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - integer i - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - QPX_x3 = VEC_LD(0,q(9,2)) - QPX_x4 = VEC_LD(0,q(13,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, 
QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3) - QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3) - QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1) - QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2) - QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3) - QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q3 = VEC_LD(0,q(9,1)) - QPX_q4 = VEC_LD(0,q(13,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_y3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_y4) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - call VEC_ST(QPX_q3, 0, q(9,1)) - call VEC_ST(QPX_q4, 0, q(13,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q3 = VEC_LD(0,q(9,2)) - QPX_q4 = VEC_LD(0,q(13,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) 
- QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - QPX_q3 = VEC_ADD(QPX_q3, QPX_x3) - QPX_q4 = VEC_ADD(QPX_q4, QPX_x4) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - call VEC_ST(QPX_q3, 0, q(9,2)) - call VEC_ST(QPX_q4, 0, q(13,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_q3 = VEC_LD(0,q(9,i)) - QPX_q4 = VEC_LD(0,q(13,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3) - QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - call VEC_ST(QPX_q3, 0, q(9,i)) - call VEC_ST(QPX_q4, 0, q(13,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q3 = VEC_LD(0,q(9,nb+1)) - QPX_q4 = VEC_LD(0,q(13,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3) - QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - call VEC_ST(QPX_q3, 0, q(9,nb+1)) - call VEC_ST(QPX_q4, 0, q(13,nb+1)) - - end subroutine hh_trafo_kernel_16_bgq - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - integer(kind=ik) :: i - VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_y1, QPX_y2 - VECTOR(REAL(8))::QPX_q1, QPX_q2 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - QPX_x2 = VEC_LD(0,q(5,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2) - - !--- multiply T matrix --- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q2 = VEC_LD(0,q(5,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_y2) - call VEC_ST(QPX_q1, 0, q(1,1)) - call VEC_ST(QPX_q2, 0, q(5,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - 
QPX_q2 = VEC_LD(0,q(5,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - QPX_q2 = VEC_ADD(QPX_q2, QPX_x2) - call VEC_ST(QPX_q1, 0, q(1,2)) - call VEC_ST(QPX_q2, 0, q(5,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_q2 = VEC_LD(0,q(5,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2) - - call VEC_ST(QPX_q1, 0, q(1,i)) - call VEC_ST(QPX_q2, 0, q(5,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q2 = VEC_LD(0,q(5,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - call VEC_ST(QPX_q2, 0, q(5,nb+1)) - - end subroutine hh_trafo_kernel_8_bgq - - ! -------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_bgq(q, hh, nb, ldq, ldh, s) - use precision - implicit none - - include 'mpif.h' - - integer(kind=ik), intent(in) :: nb, ldq, ldh - - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*), s - integer(kind=ik) :: i - VECTOR(REAL(8))::QPX_x1, QPX_y1 - VECTOR(REAL(8))::QPX_q1 - VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s - - call alignx(32,q) - - !--- multiply Householder vectors with matrix q --- - - QPX_x1 = VEC_LD(0,q(1,2)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1) - - !--- multiply T matrix 
--- - - QPX_tau1 = VEC_SPLATS(-hh(1,1)) - QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1) - QPX_tau2 = VEC_SPLATS(-hh(1,2)) - QPX_s = VEC_SPLATS(-hh(1,2)*s) - QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2) - QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1) - - !--- rank-2 update of q --- - - QPX_q1 = VEC_LD(0,q(1,1)) - QPX_q1 = VEC_ADD(QPX_q1, QPX_y1) - call VEC_ST(QPX_q1, 0, q(1,1)) - - QPX_h2 = VEC_SPLATS(hh(2,2)) - QPX_q1 = VEC_LD(0,q(1,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - QPX_q1 = VEC_ADD(QPX_q1, QPX_x1) - call VEC_ST(QPX_q1, 0, q(1,2)) - - do i=3,nb,1 - - QPX_q1 = VEC_LD(0,q(1,i)) - QPX_h1 = VEC_SPLATS(hh(i-1,1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - QPX_h2 = VEC_SPLATS(hh(i,2)) - QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1) - - call VEC_ST(QPX_q1, 0, q(1,i)) - - enddo - - QPX_h1 = VEC_SPLATS(hh(nb,1)) - QPX_q1 = VEC_LD(0,q(1,nb+1)) - QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1) - call VEC_ST(QPX_q1, 0, q(1,nb+1)) - - end subroutine hh_trafo_kernel_4_bgq -end module real_bgq_kernel -! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real.F90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real.F90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,662 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! It should be compiled with the highest possible optimization level. -! -! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! 
-------------------------------------------------------------------------------------------------- -#include "config-f90.h" - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE -#define PACK_REAL_TO_COMPLEX -#else -#undef PACK_REAL_TO_COMPLEX -#endif - -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -module real_generic_kernel - - private - public double_hh_trafo_generic -contains -#endif - - - subroutine double_hh_trafo_generic(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use iso_c_binding - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(1:ldq,1:nb+1) - real(kind=rk), intent(in) :: hh(1:ldh,1:6) -#endif - - real(kind=rk) :: s - integer(kind=ik) :: i - -! equivalence(q(1,1),q_complex(1,1)) - - ! Safety only: -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: double_hh_trafo_generic") -#endif - if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' - - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - ! Do the Householder transformations - -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -! ! assign real data to compplex pointer -! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)]) -#endif - ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller - - do i=1,nq-8,12 -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - call hh_trafo_kernel_12_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_12_generic(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - enddo - - ! i > nq-8 now, i.e. 
at most 8 rows remain - - if(nq-i+1 > 4) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - print *,"calling 8" - call hh_trafo_kernel_8_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_8_generic(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - - else if(nq-i+1 > 0) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - print *,"calling 4" - call hh_trafo_kernel_4_generic(q(i,1),hh, nb, ldq, ldh, s) -#else - call hh_trafo_kernel_4_generic(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) -#endif - - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: double_hh_trafo_generic") -#endif - end subroutine double_hh_trafo_generic - - ! -------------------------------------------------------------------------------------------------- - ! The following kernels perform the Householder transformation on Q for 12/8/4 rows. - ! Please note that Q is declared complex*16 here. - ! This is a hint for compilers that packed arithmetic can be used for Q - ! (relevant for Intel SSE and BlueGene double hummer CPUs). - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_12_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s - -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6 -#else - real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, & - y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_12_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) - x5 = q(5,2) - x6 = q(6,2) -#ifndef PACK_REAL_TO_COMPLEX - x7 = q(7,2) - x8 = q(8,2) - x9 = q(9,2) - x10 = q(10,2) - x11 = q(11,2) - x12 = q(12,2) -#endif - - y1 = q(1 ,1) + q(1, 2)*hh(2,2) - y2 = q(2 ,1) + q(2, 2)*hh(2,2) - y3 = q(3 ,1) + q(3, 2)*hh(2,2) - y4 = q(4 ,1) + q(4, 2)*hh(2,2) - y5 = q(5 ,1) + q(5, 2)*hh(2,2) - y6 = q(6 ,1) + q(6, 2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y7 = q(7 ,1) + q(7, 2)*hh(2,2) - y8 = q(8 ,1) + q(8, 2)*hh(2,2) - y9 = q(9 ,1) + q(9, 2)*hh(2,2) - y10 = q(10,1) + q(10,2)*hh(2,2) - y11 = q(11,1) + q(11,2)*hh(2,2) - y12 = q(12,1) + q(12,2)*hh(2,2) -#endif - - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1, i)*h1 - y1 = y1 + q(1, i)*h2 - x2 = x2 + q(2, i)*h1 - y2 = y2 + q(2, i)*h2 - x3 = x3 + q(3, i)*h1 - y3 = y3 + q(3, i)*h2 - x4 = x4 + q(4, i)*h1 - y4 = y4 + q(4, i)*h2 - x5 = x5 + q(5, i)*h1 - y5 = y5 + q(5, i)*h2 - x6 = x6 + q(6, i)*h1 - y6 = y6 + q(6, i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 
+ q(7, i)*h1 - y7 = y7 + q(7, i)*h2 - x8 = x8 + q(8, i)*h1 - y8 = y8 + q(8, i)*h2 - x9 = x9 + q(9, i)*h1 - y9 = y9 + q(9, i)*h2 - x10 = x10 + q(10,i)*h1 - y10 = y10 + q(10,i)*h2 - x11 = x11 + q(11,i)*h1 - y11 = y11 + q(11,i)*h2 - x12 = x12 + q(12,i)*h1 - y12 = y12 + q(12,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) - x5 = x5 + q(5,nb+1)*hh(nb,1) - x6 = x6 + q(6,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 + q(7, nb+1)*hh(nb,1) - x8 = x8 + q(8, nb+1)*hh(nb,1) - x9 = x9 + q(9, nb+1)*hh(nb,1) - x10 = x10 + q(10,nb+1)*hh(nb,1) - x11 = x11 + q(11,nb+1)*hh(nb,1) - x12 = x12 + q(12,nb+1)*hh(nb,1) - -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1 *h1 - x2 = x2 *h1 - x3 = x3 *h1 - x4 = x4 *h1 - x5 = x5 *h1 - x6 = x6 *h1 -#ifndef PACK_REAL_TO_COMPLEX - x7 = x7 *h1 - x8 = x8 *h1 - x9 = x9 *h1 - x10 = x10*h1 - x11 = x11*h1 - x12 = x12*h1 -#endif - - h1 = -tau2 - h2 = -tau2*s - y1 = y1 *h1 + x1 *h2 - y2 = y2 *h1 + x2 *h2 - y3 = y3 *h1 + x3 *h2 - y4 = y4 *h1 + x4 *h2 - y5 = y5 *h1 + x5 *h2 - y6 = y6 *h1 + x6 *h2 -#ifndef PACK_REAL_TO_COMPLEX - y7 = y7 *h1 + x7 *h2 - y8 = y8 *h1 + x8 *h2 - y9 = y9 *h1 + x9 *h2 - y10 = y10*h1 + x10*h2 - y11 = y11*h1 + x11*h2 - y12 = y12*h1 + x12*h2 -#endif - q(1,1) = q(1, 1) + y1 - q(2,1) = q(2, 1) + y2 - q(3,1) = q(3, 1) + y3 - q(4,1) = q(4, 1) + y4 - q(5,1) = q(5, 1) + y5 - q(6,1) = q(6, 1) + y6 -#ifndef PACK_REAL_TO_COMPLEX - q(7 ,1) = q(7, 1) + y7 - q(8 ,1) = q(8, 1) + y8 - q(9 ,1) = q(9, 1) + y9 - q(10,1) = q(10,1) + y10 - q(11,1) = q(11,1) + y11 - q(12,1) = q(12,1) + y12 -#endif - - q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2) - q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2) - q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2) - q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2) - q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2) - q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2) - q(8, 2) = q(8, 2) + x8 + y8 
*hh(2,2) - q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2) - q(10,2) = q(10,2) + x10 + y10*hh(2,2) - q(11,2) = q(11,2) + x11 + y11*hh(2,2) - q(12,2) = q(12,2) + x12 + y12*hh(2,2) -#endif - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1, i) = q(1,i) + x1 *h1 + y1 *h2 - q(2, i) = q(2,i) + x2 *h1 + y2 *h2 - q(3, i) = q(3,i) + x3 *h1 + y3 *h2 - q(4, i) = q(4,i) + x4 *h1 + y4 *h2 - q(5, i) = q(5,i) + x5 *h1 + y5 *h2 - q(6, i) = q(6,i) + x6 *h1 + y6 *h2 -#ifndef PACK_REAL_TO_COMPLEX - q(7, i) = q(7, i) + x7 *h1 + y7 *h2 - q(8, i) = q(8, i) + x8 *h1 + y8 *h2 - q(9, i) = q(9, i) + x9 *h1 + y9 *h2 - q(10,i) = q(10,i) + x10*h1 + y10*h2 - q(11,i) = q(11,i) + x11*h1 + y11*h2 - q(12,i) = q(12,i) + x12*h1 + y12*h2 -#endif - enddo - - q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1) - q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1) - q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1) - q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1) - q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1) - q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1) - q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1) - q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1) - q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) - q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) - q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) -#endif - - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_12_generic") -#endif - end subroutine hh_trafo_kernel_12_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_8_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4 -#else - real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, & - y1, y2, y3, y4, y5, y6, y7, y8 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_8_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) - x3 = q(3,2) - x4 = q(4,2) -#ifndef PACK_REAL_TO_COMPLEX - x5 = q(5,2) - x6 = q(6,2) - x7 = q(7,2) - x8 = q(8,2) -#endif - - y1 = q(1,1) + q(1,2)*hh(2,2) - y2 = q(2,1) + q(2,2)*hh(2,2) - y3 = q(3,1) + q(3,2)*hh(2,2) - y4 = q(4,1) + q(4,2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y5 = q(5,1) + q(5,2)*hh(2,2) - y6 = q(6,1) + q(6,2)*hh(2,2) - y7 = q(7,1) + q(7,2)*hh(2,2) - y8 = q(8,1) + q(8,2)*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5 + q(5,i)*h1 - y5 = y5 + q(5,i)*h2 - x6 = x6 + q(6,i)*h1 - y6 = y6 + q(6,i)*h2 - x7 = x7 + q(7,i)*h1 - y7 = y7 + q(7,i)*h2 - x8 = x8 + q(8,i)*h1 - y8 = y8 + q(8,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5 + q(5,nb+1)*hh(nb,1) - x6 
= x6 + q(6,nb+1)*hh(nb,1) - x7 = x7 + q(7,nb+1)*hh(nb,1) - x8 = x8 + q(8,nb+1)*hh(nb,1) -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 - x3 = x3*h1 - x4 = x4*h1 -#ifndef PACK_REAL_TO_COMPLEX - x5 = x5*h1 - x6 = x6*h1 - x7 = x7*h1 - x8 = x8*h1 -#endif - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 -#ifndef PACK_REAL_TO_COMPLEX - y5 = y5*h1 + x5*h2 - y6 = y6*h1 + x6*h2 - y7 = y7*h1 + x7*h2 - y8 = y8*h1 + x8*h2 -#endif - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 -#ifndef PACK_REAL_TO_COMPLEX - q(5,1) = q(5,1) + y5 - q(6,1) = q(6,1) + y6 - q(7,1) = q(7,1) + y7 - q(8,1) = q(8,1) + y8 -#endif - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(5,2) = q(5,2) + x5 + y5*hh(2,2) - q(6,2) = q(6,2) + x6 + y6*hh(2,2) - q(7,2) = q(7,2) + x7 + y7*hh(2,2) - q(8,2) = q(8,2) + x8 + y8*hh(2,2) -#endif - - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 -#ifndef PACK_REAL_TO_COMPLEX - q(5,i) = q(5,i) + x5*h1 + y5*h2 - q(6,i) = q(6,i) + x6*h1 + y6*h2 - q(7,i) = q(7,i) + x7*h1 + y7*h2 - q(8,i) = q(8,i) + x8*h1 + y8*h2 -#endif - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1) - q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1) - q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1) - q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_8_generic") -#endif - - end subroutine hh_trafo_kernel_8_generic - - ! 
-------------------------------------------------------------------------------------------------- - - subroutine hh_trafo_kernel_4_generic(q, hh, nb, ldq, ldh, s) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck), intent(inout) :: q(ldq/2,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(:,:) !q(1:ldq/2,1:nb+1) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - real(kind=rk), intent(in) :: s - -#ifdef PACK_REAL_TO_COMPLEX - complex(kind=ck) :: x1, x2, y1, y2 -#else - real(kind=rk) :: x1, x2, x3, x4, y1, y2, y3, y4 -#endif - real(kind=rk) :: h1, h2, tau1, tau2 - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic: hh_trafo_kernel_4_generic") -#endif - x1 = q(1,2) - x2 = q(2,2) -#ifndef PACK_REAL_TO_COMPLEX - x3 = q(3,2) - x4 = q(4,2) -#endif - - y1 = q(1,1) + q(1,2)*hh(2,2) - y2 = q(2,1) + q(2,2)*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - y3 = q(3,1) + q(3,2)*hh(2,2) - y4 = q(4,1) + q(4,2)*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x1 = x1 + q(1,i)*h1 - y1 = y1 + q(1,i)*h2 - x2 = x2 + q(2,i)*h1 - y2 = y2 + q(2,i)*h2 -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3 + q(3,i)*h1 - y3 = y3 + q(3,i)*h2 - x4 = x4 + q(4,i)*h1 - y4 = y4 + q(4,i)*h2 -#endif - enddo - - x1 = x1 + q(1,nb+1)*hh(nb,1) - x2 = x2 + q(2,nb+1)*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3 + q(3,nb+1)*hh(nb,1) - x4 = x4 + q(4,nb+1)*hh(nb,1) -#endif - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x1 = x1*h1 - x2 = x2*h1 -#ifndef PACK_REAL_TO_COMPLEX - x3 = x3*h1 - x4 = x4*h1 -#endif - h1 = -tau2 - h2 = -tau2*s - y1 = y1*h1 + x1*h2 - y2 = y2*h1 + x2*h2 -#ifndef PACK_REAL_TO_COMPLEX - y3 = y3*h1 + x3*h2 - y4 = y4*h1 + x4*h2 -#endif - - q(1,1) = q(1,1) + y1 - q(2,1) = q(2,1) + y2 -#ifndef PACK_REAL_TO_COMPLEX - q(3,1) = q(3,1) + y3 - q(4,1) = q(4,1) + y4 
-#endif - q(1,2) = q(1,2) + x1 + y1*hh(2,2) - q(2,2) = q(2,2) + x2 + y2*hh(2,2) -#ifndef PACK_REAL_TO_COMPLEX - q(3,2) = q(3,2) + x3 + y3*hh(2,2) - q(4,2) = q(4,2) + x4 + y4*hh(2,2) -#endif - - !DEC$ VECTOR ALIGNED - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1,i) = q(1,i) + x1*h1 + y1*h2 - q(2,i) = q(2,i) + x2*h1 + y2*h2 -#ifndef PACK_REAL_TO_COMPLEX - q(3,i) = q(3,i) + x3*h1 + y3*h2 - q(4,i) = q(4,i) + x4*h1 + y4*h2 -#endif - enddo - - q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1) - q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1) -#ifndef PACK_REAL_TO_COMPLEX - q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1) - q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic: hh_trafo_kernel_4_generic") -#endif - - end subroutine hh_trafo_kernel_4_generic -#ifndef DESPERATELY_WANT_ASSUMED_SIZE -end module real_generic_kernel -#endif -! -------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_simple.F90 elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_simple.F90 --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_simple.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_simple.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,136 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! 
- IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! -------------------------------------------------------------------------------------------------- -! -! This file contains the compute intensive kernels for the Householder transformations. -! -! This is the small and simple version (no hand unrolling of loops etc.) but for some -! compilers this performs better than a sophisticated version with transformed and unrolled loops. -! -! It should be compiled with the highest possible optimization level. -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! 
-------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -module real_generic_simple_kernel - - private - public double_hh_trafo_generic_simple -contains - subroutine double_hh_trafo_generic_simple(q, hh, nb, nq, ldq, ldh) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik), intent(in) :: nb, nq, ldq, ldh -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(inout) :: q(ldq,*) - real(kind=rk), intent(in) :: hh(ldh,*) -#else - real(kind=rk), intent(inout) :: q(ldq,1:nb+1) - real(kind=rk), intent(in) :: hh(ldh,2) -#endif - - real(kind=rk) :: s, h1, h2, tau1, tau2, x(nq), y(nq) - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("kernel generic simple: double_hh_trafo_generic_simple") -#endif - ! Calculate dot product of the two Householder vectors - - s = hh(2,2)*1 - do i=3,nb - s = s+hh(i,2)*hh(i-1,1) - enddo - - ! Do the Householder transformations - - x(1:nq) = q(1:nq,2) - - y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - x(1:nq) = x(1:nq) + q(1:nq,i)*h1 - y(1:nq) = y(1:nq) + q(1:nq,i)*h2 - enddo - - x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1) - - tau1 = hh(1,1) - tau2 = hh(1,2) - - h1 = -tau1 - x(1:nq) = x(1:nq)*h1 - h1 = -tau2 - h2 = -tau2*s - y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2 - - q(1:nq,1) = q(1:nq,1) + y(1:nq) - q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2) - - do i=3,nb - h1 = hh(i-1,1) - h2 = hh(i,2) - q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2 - enddo - - q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("kernel generic simple: double_hh_trafo_generic_simple") -#endif - - end subroutine double_hh_trafo_generic_simple -end module real_generic_simple_kernel -! 
-------------------------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,575 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. 
If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - - -#ifdef HAVE_SSE_INTRINSICS -#undef __AVX__ -#endif - -//Forward declaration -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); -__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); - -/* -!f>#ifdef HAVE_SSE_INTRINSICS -!f> interface -!f> subroutine double_hh_trafo_real_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_sse_2hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void double_hh_trafo_real_sse_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void double_hh_trafo_real_sse_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar product to compute - // 2 householder vectors simultaneously - double s = hh[(ldh)+1]*1.0; - - #pragma ivdep - for (i = 2; i < nb; i++) - { - s += hh[i-1] * hh[(i+ldh)]; - } - - // Production level kernel calls with padding - for (i = 0; i < nq-8; i+=12) - { - hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - if (nq == i) - { - return; - } - else - { - if (nq-i > 4) - { - hh_trafo_kernel_8_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - else if (nq-i > 0) - { - hh_trafo_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); - } - } 
-} -/** - * Unrolled kernel that computes - * 12 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ - __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [12 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - __m128d x3 = _mm_load_pd(&q[ldq+4]); - __m128d x4 = _mm_load_pd(&q[ldq+6]); - __m128d x5 = _mm_load_pd(&q[ldq+8]); - __m128d x6 = _mm_load_pd(&q[ldq+10]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - __m128d q5 = _mm_load_pd(&q[8]); - __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); - __m128d q6 = _mm_load_pd(&q[10]); - __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); - - for(i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_add_pd(x4, 
_mm_mul_pd(q4,h1)); - y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); - y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2)); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); - y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [12 x nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - x4 = _mm_mul_pd(x4, h1); - x5 = _mm_mul_pd(x5, h1); - x6 = _mm_mul_pd(x6, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); - - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); - y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); - y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); - y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - q3 = _mm_load_pd(&q[4]); - q3 = _mm_add_pd(q3, y3); - _mm_store_pd(&q[4],q3); - q4 = _mm_load_pd(&q[6]); - q4 = _mm_add_pd(q4, y4); - _mm_store_pd(&q[6],q4); - q5 = 
_mm_load_pd(&q[8]); - q5 = _mm_add_pd(q5, y5); - _mm_store_pd(&q[8],q5); - q6 = _mm_load_pd(&q[10]); - q6 = _mm_add_pd(q6, y6); - _mm_store_pd(&q[10],q6); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[ldq+6],q4); - q5 = _mm_load_pd(&q[ldq+8]); - q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[ldq+8],q5); - q6 = _mm_load_pd(&q[ldq+10]); - q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[ldq+10],q6); - - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); - q5 = _mm_load_pd(&q[(i*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2))); - _mm_store_pd(&q[(i*ldq)+8],q5); - q6 = _mm_load_pd(&q[(i*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); - _mm_store_pd(&q[(i*ldq)+10],q6); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - 
_mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - _mm_store_pd(&q[(nb*ldq)+6],q4); - q5 = _mm_load_pd(&q[(nb*ldq)+8]); - q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); - _mm_store_pd(&q[(nb*ldq)+8],q5); - q6 = _mm_load_pd(&q[(nb*ldq)+10]); - q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); - _mm_store_pd(&q[(nb*ldq)+10],q6); -} - -/** - * Unrolled kernel that computes - * 8 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ -__forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [8 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - __m128d x3 = _mm_load_pd(&q[ldq+4]); - __m128d x4 = _mm_load_pd(&q[ldq+6]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - __m128d q3 = _mm_load_pd(&q[4]); - __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - __m128d q4 = _mm_load_pd(&q[6]); - __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - - for(i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, 
_mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - y4 = _mm_add_pd(y4, _mm_mul_pd(q4,h2)); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [8 x nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - x4 = _mm_mul_pd(x4, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); - - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); - y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - q3 = _mm_load_pd(&q[4]); - q3 = _mm_add_pd(q3, y3); - _mm_store_pd(&q[4],q3); - q4 = _mm_load_pd(&q[6]); - q4 = _mm_add_pd(q4, y4); - _mm_store_pd(&q[6],q4); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, 
_mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); - q3 = _mm_load_pd(&q[ldq+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[ldq+4],q3); - q4 = _mm_load_pd(&q[ldq+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[ldq+6],q4); - - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); - _mm_store_pd(&q[(i*ldq)+4],q3); - q4 = _mm_load_pd(&q[(i*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); - _mm_store_pd(&q[(i*ldq)+6],q4); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); - _mm_store_pd(&q[(nb*ldq)+4],q3); - q4 = _mm_load_pd(&q[(nb*ldq)+6]); - q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); - _mm_store_pd(&q[(nb*ldq)+6],q4); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 2 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+1] * hh - // hh contains two householder vectors, with offset 1 - ///////////////////////////////////////////////////// - int i; - // Needed bit mask for 
floating point sign flip - __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); - __m128d sign = (__m128d)_mm_set1_epi64(smallsign); - - __m128d x1 = _mm_load_pd(&q[ldq]); - __m128d x2 = _mm_load_pd(&q[ldq+2]); - - __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h2; - - __m128d q1 = _mm_load_pd(q); - __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - __m128d q2 = _mm_load_pd(&q[2]); - __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - - for(i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - ///////////////////////////////////////////////////// - // Rank-2 update of Q [12 x nb+1] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(hh); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs = _mm_loaddup_pd(&s); - - h1 = _mm_xor_pd(tau1, sign); - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - h1 = _mm_xor_pd(tau2, sign); - h2 = _mm_mul_pd(h1, vs); - - y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - - q1 = _mm_load_pd(q); - q1 = _mm_add_pd(q1, y1); - _mm_store_pd(q,q1); - q2 = _mm_load_pd(&q[2]); - q2 = _mm_add_pd(q2, y2); - _mm_store_pd(&q[2],q2); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[ldq],q1); - q2 = _mm_load_pd(&q[ldq+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[ldq+2],q2); - - for (i = 2; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-1]); - 
h2 = _mm_loaddup_pd(&hh[ldh+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); - _mm_store_pd(&q[i*ldq],q1); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); - _mm_store_pd(&q[(i*ldq)+2],q2); - } - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); - _mm_store_pd(&q[nb*ldq],q1); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); - _mm_store_pd(&q[(nb*ldq)+2],q2); -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1005 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. 
-// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". 
-// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - -#ifdef HAVE_SSE_INTRINSICS -#undef __AVX__ -#endif - -//Forward declaration -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); -__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); - -/* -!f>#ifdef HAVE_SSE_INTRINSICS -!f> interface -!f> subroutine quad_hh_trafo_real_sse_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_sse_4hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 4 householder vectors simultaneously - double s_1_2 = hh[(ldh)+1]; - double s_1_3 = hh[(ldh*2)+2]; - double s_2_3 = hh[(ldh*2)+1]; - double s_1_4 = hh[(ldh*3)+3]; - double s_2_4 = hh[(ldh*3)+2]; - double s_3_4 = hh[(ldh*3)+1]; - - // calculate scalar product of first and fourth householder vector - 
// loop counter = 2 - s_1_2 += hh[2-1] * hh[(2+ldh)]; - s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; - s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; - - // loop counter = 3 - s_1_2 += hh[3-1] * hh[(3+ldh)]; - s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; - s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; - - s_1_3 += hh[3-2] * hh[3+(ldh*2)]; - s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; - - #pragma ivdep - for (i = 4; i < nb; i++) - { - s_1_2 += hh[i-1] * hh[(i+ldh)]; - s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - - s_1_3 += hh[i-2] * hh[i+(ldh*2)]; - s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - - s_1_4 += hh[i-3] * hh[i+(ldh*3)]; - } - -// printf("s_1_2: %f\n", s_1_2); -// printf("s_1_3: %f\n", s_1_3); -// printf("s_2_3: %f\n", s_2_3); -// printf("s_1_4: %f\n", s_1_4); -// printf("s_2_4: %f\n", s_2_4); -// printf("s_3_4: %f\n", s_3_4); - - // Production level kernel calls with padding - for (i = 0; i < nq-4; i+=6) - { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - if (nq == i) - { - return; - } - else - { - if (nq-i > 2) - { - hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - else - { - hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } - } -} - -#if 0 -void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 4 householder vectors simultaneously - double s_1_2 = hh[(ldh)+1]; - double s_1_3 = hh[(ldh*2)+2]; - double s_2_3 = hh[(ldh*2)+1]; - double s_1_4 = hh[(ldh*3)+3]; - double s_2_4 = hh[(ldh*3)+2]; - double s_3_4 = hh[(ldh*3)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - s_1_2 += hh[2-1] * hh[(2+ldh)]; - s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; - s_3_4 += hh[(ldh*2)+2-1] * 
hh[2+(ldh*3)]; - - // loop counter = 3 - s_1_2 += hh[3-1] * hh[(3+ldh)]; - s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; - s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; - - s_1_3 += hh[3-2] * hh[3+(ldh*2)]; - s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; - - #pragma ivdep - for (i = 4; i < nb; i++) - { - s_1_2 += hh[i-1] * hh[(i+ldh)]; - s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - - s_1_3 += hh[i-2] * hh[i+(ldh*2)]; - s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - - s_1_4 += hh[i-3] * hh[i+(ldh*3)]; - } - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=12) - { - hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } -#else - for (i = 0; i < nq; i+=6) - { - hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); - } -#endif -} -#endif -/** - * Unrolled kernel that computes - * 6 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [6 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = 
_mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - register __m128d x1 = a1_1; - - __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a3_2 = _mm_load_pd(&q[ldq+2]); - __m128d a4_2 = _mm_load_pd(&q[0+2]); - - register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); - register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); - register __m128d x2 = a1_2; - - __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); - __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); - __m128d a3_3 = _mm_load_pd(&q[ldq+4]); - __m128d a4_3 = _mm_load_pd(&q[0+4]); - - register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); - w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); - w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); - register __m128d z3 = _mm_add_pd(a3_3, _mm_mul_pd(a2_3, h_3_2)); - z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); - register __m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); - register __m128d x3 = a1_3; - - __m128d q1; - __m128d q2; - __m128d q3; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - - h3 = 
_mm_loaddup_pd(&hh[(ldh*2)+i-1]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [6 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - x3 = _mm_mul_pd(x3, h1); - - __m128d tau2 = 
_mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); - - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); - - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); - z3 = _mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau4; - h2 = _mm_mul_pd(h1, vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); - - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); - w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); - - q1 = _mm_load_pd(&q[0]); - q2 = _mm_load_pd(&q[2]); - q3 = _mm_load_pd(&q[4]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); - q3 = _mm_sub_pd(q3, w3); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - _mm_store_pd(&q[4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[ldq+2]); - q3 = _mm_load_pd(&q[ldq+4]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); - q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); - q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); - - _mm_store_pd(&q[ldq],q1); - 
_mm_store_pd(&q[ldq+2],q2); - _mm_store_pd(&q[ldq+4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); - q3 = _mm_load_pd(&q[(ldq*2)+4]); - q1 = _mm_sub_pd(q1, y1); - q2 = _mm_sub_pd(q2, y2); - q3 = _mm_sub_pd(q3, y3); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); - - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - _mm_store_pd(&q[(ldq*2)+4],q3); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); - q3 = _mm_load_pd(&q[(ldq*3)+4]); - q1 = _mm_sub_pd(q1, x1); - q2 = _mm_sub_pd(q2, x2); - q3 = _mm_sub_pd(q3, x3); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); - _mm_store_pd(&q[ldq*3], q1); - _mm_store_pd(&q[(ldq*3)+2], q2); - _mm_store_pd(&q[(ldq*3)+4], q3); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - q3 = _mm_load_pd(&q[(i*ldq)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1,h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - - q1 = 
_mm_sub_pd(q1, _mm_mul_pd(z1,h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); - - _mm_store_pd(&q[i*ldq],q1); - _mm_store_pd(&q[(i*ldq)+2],q2); - _mm_store_pd(&q[(i*ldq)+4],q3); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - q3 = _mm_load_pd(&q[(nb*ldq)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); - - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - _mm_store_pd(&q[(nb*ldq)+4],q3); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); - - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - _mm_store_pd(&q[((nb+1)*ldq)+4],q3); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); - - 
_mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); - _mm_store_pd(&q[((nb+2)*ldq)+4],q3); -} - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - __m128d x1 = a1_1; - - __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a3_2 = _mm_load_pd(&q[ldq+2]); - __m128d a4_2 = _mm_load_pd(&q[0+2]); - - __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); - __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); - __m128d x2 = a1_2; - - 
__m128d q1; - __m128d q2; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - - q2 = _mm_load_pd(&q[(i*ldq)+2]); - - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); - - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - - __m128d vs_1_2 = 
_mm_loaddup_pd(&s_1_2); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - x2 = _mm_mul_pd(x2, h1); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); - - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); - - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); - - h1 = tau4; - h2 = _mm_mul_pd(h1, vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); - - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); - - q1 = _mm_load_pd(&q[0]); - q2 = _mm_load_pd(&q[2]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[ldq+2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); - q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); - - _mm_store_pd(&q[ldq],q1); - _mm_store_pd(&q[ldq+2],q2); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); - q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - h4 = 
_mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); - q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); - - _mm_store_pd(&q[ldq*3], q1); - _mm_store_pd(&q[(ldq*3)+2], q2); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); - - _mm_store_pd(&q[i*ldq],q1); - - q2 = _mm_load_pd(&q[(i*ldq)+2]); - - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); - - _mm_store_pd(&q[(i*ldq)+2],q2); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); - - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); - q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, h1))); - - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - - q1 = 
_mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - _mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); -} - -/** - * Unrolled kernel that computes - * 2 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [2 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*3]); - __m128d a2_1 = _mm_load_pd(&q[ldq*2]); - __m128d a3_1 = _mm_load_pd(&q[ldq]); - __m128d a4_1 = _mm_load_pd(&q[0]); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - __m128d x1 = a1_1; - - __m128d q1; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - - for(i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - } - - h1 = 
_mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - ///////////////////////////////////////////////////// - // Rank-1 update of Q [2 x nb+3] - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - - __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); - __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); - __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); - __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); - __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); - - h1 = tau1; - x1 = _mm_mul_pd(x1, h1); - - h1 = tau2; - h2 = _mm_mul_pd(h1, vs_1_2); - - y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); - - h1 = tau3; - h2 = _mm_mul_pd(h1, vs_1_3); - h3 = _mm_mul_pd(h1, vs_2_3); - - z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - - h1 = tau4; - h2 = _mm_mul_pd(h1, vs_1_4); - h3 = _mm_mul_pd(h1, vs_2_4); - h4 = _mm_mul_pd(h1, vs_3_4); - - w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - - q1 = _mm_load_pd(&q[0]); - q1 = _mm_sub_pd(q1, w1); - _mm_store_pd(&q[0],q1); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); - - _mm_store_pd(&q[ldq],q1); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - h4 = 
_mm_loaddup_pd(&hh[(ldh*3)+2]); - q1 = _mm_load_pd(&q[ldq*2]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); - - _mm_store_pd(&q[ldq*2],q1); - - h2 = _mm_loaddup_pd(&hh[ldh+1]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - q1 = _mm_load_pd(&q[ldq*3]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); - - _mm_store_pd(&q[ldq*3], q1); - - for (i = 4; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-3]); - h2 = _mm_loaddup_pd(&hh[ldh+i-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); - - q1 = _mm_load_pd(&q[i*ldq]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); - - _mm_store_pd(&q[i*ldq],q1); - } - - h1 = _mm_loaddup_pd(&hh[nb-3]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - q1 = _mm_load_pd(&q[nb*ldq]); - - q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); - - _mm_store_pd(&q[nb*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); - - _mm_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - _mm_store_pd(&q[(nb+2)*ldq],q1); -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c --- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1353 +0,0 @@ -// This file is part of ELPA. 
-// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// This particular source code file contains additions, changes and -// enhancements authored by Intel Corporation which is not part of -// the ELPA consortium. -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. 
-// -// -// -------------------------------------------------------------------------------------------------- -// -// This file contains the compute intensive kernels for the Householder transformations. -// It should be compiled with the highest possible optimization level. -// -// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 -// On Intel Sandy Bridge use -O3 -mavx -// -// Copyright of the original code rests with the authors inside the ELPA -// consortium. The copyright of any additional modifications shall rest -// with their original authors, but shall adhere to the licensing terms -// distributed along with the original code in the file "COPYING". -// -// Author: Alexander Heinecke (alexander.heinecke@mytum.de) -// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) -// -------------------------------------------------------------------------------------------------- - -#include "config-f90.h" - -#include - -#define __forceinline __attribute__((always_inline)) static - -#ifdef HAVE_SSE_INTRINSICS -#undef __AVX__ -#endif - -//Forward declaration -static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); -static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); - -/* -!f>#ifdef HAVE_SSE_INTRINSICS -!f> interface -!f> subroutine hexa_hh_trafo_real_sse_6hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="hexa_hh_trafo_real_sse_6hv") -!f> use, intrinsic :: iso_c_binding -!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh -!f> real(kind=c_double) :: q(*) -!f> real(kind=c_double) :: hh(pnb,6) -!f> end subroutine -!f> end interface -!f>#endif -*/ - -void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); - -void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - 
int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = hh[(ldh*4)+1]; - scalarprods[10] = hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - 
scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * 
hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: %f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding - for (i = 0; i < nq-2; i+=4) - { - hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } - if (nq == i) - { - return; - } - else - { - hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -} - -#if 0 -void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) -{ - int i; - int nb = *pnb; - int nq = *pldq; - int ldq = *pldq; - int ldh = *pldh; - - // calculating scalar products to compute - // 6 householder vectors simultaneously - double scalarprods[15]; - -// scalarprods[0] = s_1_2; -// scalarprods[1] = s_1_3; -// scalarprods[2] = s_2_3; -// scalarprods[3] = s_1_4; -// scalarprods[4] = s_2_4; -// scalarprods[5] = s_3_4; -// scalarprods[6] = s_1_5; -// scalarprods[7] = s_2_5; -// scalarprods[8] = s_3_5; -// scalarprods[9] = s_4_5; -// scalarprods[10] = s_1_6; -// scalarprods[11] = s_2_6; -// scalarprods[12] = s_3_6; -// scalarprods[13] = s_4_6; -// scalarprods[14] = s_5_6; - - scalarprods[0] = hh[(ldh+1)]; - scalarprods[1] = hh[(ldh*2)+2]; - scalarprods[2] = hh[(ldh*2)+1]; - scalarprods[3] = hh[(ldh*3)+3]; - scalarprods[4] = hh[(ldh*3)+2]; - scalarprods[5] = hh[(ldh*3)+1]; - scalarprods[6] = hh[(ldh*4)+4]; - scalarprods[7] = hh[(ldh*4)+3]; - scalarprods[8] = hh[(ldh*4)+2]; - scalarprods[9] = 
hh[(ldh*4)+1]; - scalarprods[10] = hh[(ldh*5)+5]; - scalarprods[11] = hh[(ldh*5)+4]; - scalarprods[12] = hh[(ldh*5)+3]; - scalarprods[13] = hh[(ldh*5)+2]; - scalarprods[14] = hh[(ldh*5)+1]; - - // calculate scalar product of first and fourth householder vector - // loop counter = 2 - scalarprods[0] += hh[1] * hh[(2+ldh)]; - scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; - - // loop counter = 3 - scalarprods[0] += hh[2] * hh[(3+ldh)]; - scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; - - scalarprods[1] += hh[1] * hh[3+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; - - // loop counter = 4 - scalarprods[0] += hh[3] * hh[(4+ldh)]; - scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; - - scalarprods[1] += hh[2] * hh[4+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; - - scalarprods[3] += hh[1] * hh[4+(ldh*3)]; - scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; - - // loop counter = 5 - scalarprods[0] += hh[4] * hh[(5+ldh)]; - scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; - - scalarprods[1] += hh[3] * hh[5+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; - scalarprods[8] += 
hh[(ldh*2)+3] * hh[5+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; - - scalarprods[3] += hh[2] * hh[5+(ldh*3)]; - scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; - - scalarprods[6] += hh[1] * hh[5+(ldh*4)]; - scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; - - #pragma ivdep - for (i = 6; i < nb; i++) - { - scalarprods[0] += hh[i-1] * hh[(i+ldh)]; - scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; - scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; - scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; - scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; - - scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; - scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; - scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; - scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; - - scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; - scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; - scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; - - scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; - scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; - - scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; - } - -// printf("s_1_2: %f\n", scalarprods[0]); -// printf("s_1_3: %f\n", scalarprods[1]); -// printf("s_2_3: %f\n", scalarprods[2]); -// printf("s_1_4: %f\n", scalarprods[3]); -// printf("s_2_4: %f\n", scalarprods[4]); -// printf("s_3_4: %f\n", scalarprods[5]); -// printf("s_1_5: %f\n", scalarprods[6]); -// printf("s_2_5: %f\n", scalarprods[7]); -// printf("s_3_5: %f\n", scalarprods[8]); -// printf("s_4_5: %f\n", scalarprods[9]); -// printf("s_1_6: %f\n", scalarprods[10]); -// printf("s_2_6: %f\n", scalarprods[11]); -// printf("s_3_6: %f\n", scalarprods[12]); -// printf("s_4_6: %f\n", scalarprods[13]); -// printf("s_5_6: %f\n", scalarprods[14]); - - // Production level kernel calls with padding -#ifdef __AVX__ - for (i = 0; i < nq; i+=8) - { - hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#else - for (i = 0; i < nq; i+=4) - { - 
hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); - } -#endif -} -#endif - -/** - * Unrolled kernel that computes - * 4 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [4 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*5]); - __m128d a2_1 = _mm_load_pd(&q[ldq*4]); - __m128d a3_1 = _mm_load_pd(&q[ldq*3]); - __m128d a4_1 = _mm_load_pd(&q[ldq*2]); - __m128d a5_1 = _mm_load_pd(&q[ldq]); - __m128d a6_1 = _mm_load_pd(&q[0]); - - __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); - __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); - - register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); - - __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); - __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); - - register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); - - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = 
_mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - - register __m128d x1 = a1_1; - - __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); - __m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]); - __m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]); - __m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]); - __m128d a5_2 = _mm_load_pd(&q[(ldq)+2]); - __m128d a6_2 = _mm_load_pd(&q[2]); - - register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2)); - t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1)); - register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2)); - v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1)); - register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); - w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); - register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); - z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); - register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); - - register __m128d x2 = a1_2; - - __m128d q1; - __m128d q2; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - __m128d h5; - __m128d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-5]); - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = 
_mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); - - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); - - t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); - t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); - - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); 
- x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); - q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - x1 = _mm_mul_pd(x1, tau1); - x2 = _mm_mul_pd(x2, tau1); - - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); - h2 = _mm_mul_pd(tau2, vs_1_2); - - y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); - y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); - __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); - h2 = _mm_mul_pd(tau3, vs_1_3); - h3 = _mm_mul_pd(tau3, vs_2_3); - - z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); - __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); - h2 = _mm_mul_pd(tau4, vs_1_4); - h3 = _mm_mul_pd(tau4, vs_2_4); - __m128d vs_3_4 = 
_mm_loaddup_pd(&scalarprods[5]); - h4 = _mm_mul_pd(tau4, vs_3_4); - - w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); - - __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); - __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); - __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); - h2 = _mm_mul_pd(tau5, vs_1_5); - h3 = _mm_mul_pd(tau5, vs_2_5); - __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); - __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); - h4 = _mm_mul_pd(tau5, vs_3_5); - h5 = _mm_mul_pd(tau5, vs_4_5); - - v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); - - __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); - __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); - __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); - h2 = _mm_mul_pd(tau6, vs_1_6); - h3 = _mm_mul_pd(tau6, vs_2_6); - __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); - __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); - __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); - h4 = _mm_mul_pd(tau6, vs_3_6); - h5 = _mm_mul_pd(tau6, vs_4_6); - h6 = _mm_mul_pd(tau6, vs_5_6); - - t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); - t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))))); - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [4 x nb+3] - ///////////////////////////////////////////////////// 
- - q1 = _mm_load_pd(&q[0]); - q2 = _mm_load_pd(&q[2]); - q1 = _mm_sub_pd(q1, t1); - q2 = _mm_sub_pd(q2, t2); - _mm_store_pd(&q[0],q1); - _mm_store_pd(&q[2],q2); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - q1 = _mm_load_pd(&q[ldq]); - q2 = _mm_load_pd(&q[(ldq+2)]); - q1 = _mm_sub_pd(q1, v1); - q2 = _mm_sub_pd(q2, v2); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[ldq],q1); - _mm_store_pd(&q[(ldq+2)],q2); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - q1 = _mm_load_pd(&q[ldq*2]); - q2 = _mm_load_pd(&q[(ldq*2)+2]); - q1 = _mm_sub_pd(q1, w1); - q2 = _mm_sub_pd(q2, w2); - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[ldq*2],q1); - _mm_store_pd(&q[(ldq*2)+2],q2); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq*3]); - q2 = _mm_load_pd(&q[(ldq*3)+2]); - q1 = _mm_sub_pd(q1, z1); - q2 = _mm_sub_pd(q2, z2); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[ldq*3],q1); - _mm_store_pd(&q[(ldq*3)+2],q2); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - q1 = _mm_load_pd(&q[ldq*4]); - q2 = _mm_load_pd(&q[(ldq*4)+2]); - q1 = _mm_sub_pd(q1, y1); - q2 = _mm_sub_pd(q2, y2); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - h6 = 
_mm_loaddup_pd(&hh[(ldh*5)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[ldq*4],q1); - _mm_store_pd(&q[(ldq*4)+2],q2); - - h2 = _mm_loaddup_pd(&hh[(ldh)+1]); - q1 = _mm_load_pd(&q[ldq*5]); - q2 = _mm_load_pd(&q[(ldq*5)+2]); - q1 = _mm_sub_pd(q1, x1); - q2 = _mm_sub_pd(q2, x2); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[ldq*5],q1); - _mm_store_pd(&q[(ldq*5)+2],q2); - - for (i = 6; i < nb; i++) - { - q1 = _mm_load_pd(&q[i*ldq]); - q2 = _mm_load_pd(&q[(i*ldq)+2]); - h1 = _mm_loaddup_pd(&hh[i-5]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); - - _mm_store_pd(&q[i*ldq],q1); - _mm_store_pd(&q[(i*ldq)+2],q2); - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = 
_mm_load_pd(&q[nb*ldq]); - q2 = _mm_load_pd(&q[(nb*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); - - _mm_store_pd(&q[nb*ldq],q1); - _mm_store_pd(&q[(nb*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); - - _mm_store_pd(&q[(nb+1)*ldq],q1); - _mm_store_pd(&q[((nb+1)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); - - _mm_store_pd(&q[(nb+2)*ldq],q1); - _mm_store_pd(&q[((nb+2)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = 
_mm_load_pd(&q[(nb+3)*ldq]); - q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); - - _mm_store_pd(&q[(nb+3)*ldq],q1); - _mm_store_pd(&q[((nb+3)*ldq)+2],q2); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); - - _mm_store_pd(&q[(nb+4)*ldq],q1); - _mm_store_pd(&q[((nb+4)*ldq)+2],q2); -} - -/** - * Unrolled kernel that computes - * 2 rows of Q simultaneously, a - * matrix vector product with two householder - * vectors + a rank 1 update is performed - */ -__forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) -{ - ///////////////////////////////////////////////////// - // Matrix Vector Multiplication, Q [2 x nb+3] * hh - // hh contains four householder vectors - ///////////////////////////////////////////////////// - int i; - - __m128d a1_1 = _mm_load_pd(&q[ldq*5]); - __m128d a2_1 = _mm_load_pd(&q[ldq*4]); - __m128d a3_1 = _mm_load_pd(&q[ldq*3]); - __m128d a4_1 = _mm_load_pd(&q[ldq*2]); - __m128d a5_1 = _mm_load_pd(&q[ldq]); - __m128d a6_1 = _mm_load_pd(&q[0]); - - __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); - __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); - - register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); - t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); - - __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); - __m128d h_5_3 = 
_mm_loaddup_pd(&hh[(ldh*4)+2]); - __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); - - register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); - v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); - - __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); - w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); - - __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); - __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - - register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); - z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); - register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); - - register __m128d x1 = a1_1; - - __m128d q1; - - __m128d h1; - __m128d h2; - __m128d h3; - __m128d h4; - __m128d h5; - __m128d h6; - - for(i = 6; i < nb; i++) - { - h1 = _mm_loaddup_pd(&hh[i-5]); - q1 = _mm_load_pd(&q[i*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); - - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); - - t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); - - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - - h4 = 
_mm_loaddup_pd(&hh[(ldh*3)+nb-2]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); - - v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); - - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); - - w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - - y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - - x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); - - ///////////////////////////////////////////////////// - // Apply tau, correct wrong calculation using pre-calculated scalar products - ///////////////////////////////////////////////////// - - __m128d tau1 = _mm_loaddup_pd(&hh[0]); - x1 = _mm_mul_pd(x1, tau1); - - __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); - __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); - h2 = _mm_mul_pd(tau2, vs_1_2); - - y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); - - __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); - __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); - __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); - h2 = _mm_mul_pd(tau3, vs_1_3); - h3 = _mm_mul_pd(tau3, vs_2_3); - - z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); - - __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); - __m128d vs_1_4 = 
_mm_loaddup_pd(&scalarprods[3]); - __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); - h2 = _mm_mul_pd(tau4, vs_1_4); - h3 = _mm_mul_pd(tau4, vs_2_4); - __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); - h4 = _mm_mul_pd(tau4, vs_3_4); - - w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - - __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); - __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); - __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); - h2 = _mm_mul_pd(tau5, vs_1_5); - h3 = _mm_mul_pd(tau5, vs_2_5); - __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); - __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); - h4 = _mm_mul_pd(tau5, vs_3_5); - h5 = _mm_mul_pd(tau5, vs_4_5); - - v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); - - __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); - __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); - __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]); - h2 = _mm_mul_pd(tau6, vs_1_6); - h3 = _mm_mul_pd(tau6, vs_2_6); - __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); - __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); - __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); - h4 = _mm_mul_pd(tau6, vs_3_6); - h5 = _mm_mul_pd(tau6, vs_4_6); - h6 = _mm_mul_pd(tau6, vs_5_6); - - t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); - - ///////////////////////////////////////////////////// - // Rank-1 update of Q [2 x nb+3] - ///////////////////////////////////////////////////// - - q1 = _mm_load_pd(&q[0]); - q1 = _mm_sub_pd(q1, t1); - _mm_store_pd(&q[0],q1); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); - q1 = _mm_load_pd(&q[ldq]); - q1 = _mm_sub_pd(q1, v1); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[ldq],q1); - - h5 = 
_mm_loaddup_pd(&hh[(ldh*4)+1]); - q1 = _mm_load_pd(&q[ldq*2]); - q1 = _mm_sub_pd(q1, w1); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[ldq*2],q1); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); - q1 = _mm_load_pd(&q[ldq*3]); - q1 = _mm_sub_pd(q1, z1); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[ldq*3],q1); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); - q1 = _mm_load_pd(&q[ldq*4]); - q1 = _mm_sub_pd(q1, y1); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[ldq*4],q1); - - h2 = _mm_loaddup_pd(&hh[(ldh)+1]); - q1 = _mm_load_pd(&q[ldq*5]); - q1 = _mm_sub_pd(q1, x1); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[ldq*5],q1); - - for (i = 6; i < nb; i++) - { - q1 = _mm_load_pd(&q[i*ldq]); - h1 = _mm_loaddup_pd(&hh[i-5]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+i-4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); - 
- q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); - - _mm_store_pd(&q[i*ldq],q1); - } - - h1 = _mm_loaddup_pd(&hh[nb-5]); - q1 = _mm_load_pd(&q[nb*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); - - _mm_store_pd(&q[nb*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-4]); - q1 = _mm_load_pd(&q[(nb+1)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); - - _mm_store_pd(&q[(nb+1)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-3]); - q1 = _mm_load_pd(&q[(nb+2)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); - - _mm_store_pd(&q[(nb+2)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-2]); - q1 = _mm_load_pd(&q[(nb+3)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); - - _mm_store_pd(&q[(nb+3)*ldq],q1); - - h1 = _mm_loaddup_pd(&hh[nb-1]); - q1 = _mm_load_pd(&q[(nb+4)*ldq]); - - q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); - - _mm_store_pd(&q[(nb+4)*ldq],q1); -} diff -Nru elpa-2016.05.001/src/elpa2_kernels/mod_fortran_interfaces.F90 elpa-2019.11.001/src/elpa2_kernels/mod_fortran_interfaces.F90 --- 
elpa-2016.05.001/src/elpa2_kernels/mod_fortran_interfaces.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/mod_fortran_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -#include "config-f90.h" - -module kernel_interfaces - - implicit none - -#include "elpa/elpa_generated_fortran_interfaces.h" - -end module diff -Nru elpa-2016.05.001/src/elpa2_kernels/mod_single_hh_trafo_real.F90 elpa-2019.11.001/src/elpa2_kernels/mod_single_hh_trafo_real.F90 --- elpa-2016.05.001/src/elpa2_kernels/mod_single_hh_trafo_real.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_kernels/mod_single_hh_trafo_real.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -module single_hh_trafo_real - implicit none -#include "config-f90.h" - -#ifdef WITH_OPENMP - public single_hh_trafo_real_cpu_openmp -#else - public single_hh_trafo_real_cpu -#endif - contains - -#ifdef WITH_OPENMP - subroutine single_hh_trafo_real_cpu_openmp(q, hh, nb, nq, ldq) -#else - subroutine single_hh_trafo_real_cpu(q, hh, nb, nq, ldq) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - ! Perform single real Householder transformation. - ! This routine is not performance critical and thus it is coded here in Fortran - - implicit none - integer(kind=ik), intent(in) :: nb, nq, ldq -! real(kind=rk), intent(inout) :: q(ldq, *) -! real(kind=rk), intent(in) :: hh(*) - real(kind=rk), intent(inout) :: q(1:ldq, 1:nb) - real(kind=rk), intent(in) :: hh(1:nb) - integer(kind=ik) :: i - real(kind=rk) :: v(nq) - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("single_hh_trafo_real_cpu_openmp") -#else - call timer%start("single_hh_trafo_real_cpu") -#endif -#endif - - ! v = q * hh - v(:) = q(1:nq,1) - do i=2,nb - v(:) = v(:) + q(1:nq,i) * hh(i) - enddo - - ! v = v * tau - v(:) = v(:) * hh(1) - - ! 
q = q - v * hh**T - q(1:nq,1) = q(1:nq,1) - v(:) - do i=2,nb - q(1:nq,i) = q(1:nq,i) - v(:) * hh(i) - enddo - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%stop("single_hh_trafo_real_cpu_openmp") -#else - call timer%stop("single_hh_trafo_real_cpu") -#endif -#endif - end subroutine - - -end module diff -Nru elpa-2016.05.001/src/elpa2_print_kernels.F90 elpa-2019.11.001/src/elpa2_print_kernels.F90 --- elpa-2016.05.001/src/elpa2_print_kernels.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_print_kernels.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,132 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - -#include "config-f90.h" - -!> \file print_available_elpa2_kernels.F90 -!> \par -!> \brief Provide information which ELPA2 kernels are available on this system -!> -!> \details -!> It is possible to configure ELPA2 such, that different compute intensive -!> "ELPA2 kernels" can be choosen at runtime. -!> The service binary print_available_elpa2_kernels will query the library and tell -!> whether ELPA2 has been configured in this way, and if this is the case which kernels can be -!> choosen at runtime. -!> It will furthermore detail whether ELPA has been configured with OpenMP support -!> -!> Synopsis: print_available_elpa2_kernels -!> -!> \author A. 
Marek (MPCDF) -program print_available_elpa2_kernels - - use precision - use ELPA1 - use ELPA2 - - use elpa2_utilities - - implicit none - - integer(kind=ik) :: i - - print *, "This program will give information on the ELPA2 kernels, " - print *, "which are available with this library and it will give " - print *, "information if (and how) the kernels can be choosen at " - print *, "runtime" - print * - print * -#ifdef WITH_OPENMP - print *, " ELPA supports threads: yes" -#else - print *, " ELPA supports threads: no" -#endif - - print *, "Information on ELPA2 real case: " - print *, "=============================== " -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, " choice via environment variable: yes" - print *, " environment variable name : REAL_ELPA_KERNEL" -#else - print *, " choice via environment variable: no" -#endif - print * - print *, " Available real kernels are: " -#ifdef HAVE_AVX2 - print *, " AVX kernels are optimized for FMA (AVX2)" -#endif - call print_available_real_kernels() - - print * - print * - print *, "Information on ELPA2 complex case: " - print *, "=============================== " -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, " choice via environment variable: yes" - print *, " environment variable name : COMPLEX_ELPA_KERNEL" -#else - print *, " choice via environment variable: no" -#endif - print * - print *, " Available complex kernels are: " -#ifdef HAVE_AVX2 - print *, " AVX kernels are optimized for FMA (AVX2)" -#endif - call print_available_complex_kernels() - -end program print_available_elpa2_kernels diff -Nru elpa-2016.05.001/src/elpa2_utilities.F90 elpa-2019.11.001/src/elpa2_utilities.F90 --- elpa-2016.05.001/src/elpa2_utilities.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa2_utilities.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,842 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - - -! ELPA2 -- 2-stage solver for ELPA -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". - - -#include "config-f90.h" -#include - -module ELPA2_utilities - use ELPA_utilities - implicit none - - PRIVATE ! By default, all routines contained are private - - ! The following routines are public: - - public :: get_actual_real_kernel_name, get_actual_complex_kernel_name - public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & - REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & - REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2, & - REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6, & - REAL_ELPA_KERNEL_AVX_BLOCK2, & - REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, & - REAL_ELPA_KERNEL_AVX2_BLOCK2, & - REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,& - DEFAULT_REAL_ELPA_KERNEL - - public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & - COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & - COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1, & - COMPLEX_ELPA_KERNEL_SSE_BLOCK2, & - COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2, & - COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, & - DEFAULT_COMPLEX_ELPA_KERNEL - - public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES - - public :: get_actual_complex_kernel, get_actual_real_kernel - - public :: check_allowed_complex_kernels, check_allowed_real_kernels - - public :: AVAILABLE_COMPLEX_ELPA_KERNELS, AVAILABLE_REAL_ELPA_KERNELS - - public :: print_available_real_kernels, print_available_complex_kernels - public :: query_available_real_kernels, query_available_complex_kernels - - public :: qr_decomposition_via_environment_variable - - integer, parameter :: number_of_real_kernels = ELPA2_NUMBER_OF_REAL_KERNELS - integer, parameter :: 
REAL_ELPA_KERNEL_GENERIC = ELPA2_REAL_KERNEL_GENERIC - integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_REAL_KERNEL_GENERIC_SIMPLE - integer, parameter :: REAL_ELPA_KERNEL_BGP = ELPA2_REAL_KERNEL_BGP - integer, parameter :: REAL_ELPA_KERNEL_BGQ = ELPA2_REAL_KERNEL_BGQ - integer, parameter :: REAL_ELPA_KERNEL_SSE = ELPA2_REAL_KERNEL_SSE - integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_REAL_KERNEL_SSE_BLOCK2 - integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4 = ELPA2_REAL_KERNEL_SSE_BLOCK4 - integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6 = ELPA2_REAL_KERNEL_SSE_BLOCK6 - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 - integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 - integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_REAL_KERNEL_AVX2_BLOCK2 - integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4 = ELPA2_REAL_KERNEL_AVX2_BLOCK4 - integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6 = ELPA2_REAL_KERNEL_AVX2_BLOCK6 - -#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) - -#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - -#ifdef WITH_REAL_GENERIC_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#endif -#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE -#endif -#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE -#endif -#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) - -#ifdef WITH_REAL_SSE_BLOCK6_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 -#else - -#ifdef WITH_REAL_SSE_BLOCK4_KERNEL - integer, parameter :: 
DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 -#else -#ifdef WITH_REAL_SSE_BLOCK2_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 -#endif -#endif -#endif -#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ - -#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) -#ifdef WITH_REAL_AVX_BLOCK6_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 -#else -#ifdef WITH_REAL_AVX_BLOCK4_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 -#else -#ifdef WITH_REAL_AVX_BLOCK2_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 -#endif -#endif -#endif -#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ - -#ifdef WITH_REAL_BGP_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP -#endif -#ifdef WITH_REAL_BGQ_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ -#endif - -#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - -#else /* WITH_REAL_AVX_BLOCK2_KERNEL */ - -#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - -#ifdef WITH_REAL_GENERIC_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC -#endif -#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE -#endif -#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE -#endif - -#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) -#ifdef WITH_REAL_SSE_BLOCK6_KERNEL - integer, 
parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6 -#else -#ifdef WITH_REAL_SSE_BLOCK4_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4 -#else -#ifdef WITH_REAL_SSE_BLOCK2_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2 -#endif -#endif -#endif -#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */ - -#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) -#ifdef WITH_REAL_AVX_BLOCK6_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6 -#else -#ifdef WITH_REAL_AVX_BLOCK4_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4 -#else -#ifdef WITH_REAL_AVX_BLOCK2_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2 -#endif -#endif -#endif -#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */ - -#ifdef WITH_REAL_BGP_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP -#endif -#ifdef WITH_REAL_BGQ_KERNEL - integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ -#endif - -#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */ - -#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */ - - character(35), parameter, dimension(number_of_real_kernels) :: & - REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", & - "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & - "REAL_ELPA_KERNEL_BGP ", & - "REAL_ELPA_KERNEL_BGQ ", & - "REAL_ELPA_KERNEL_SSE ", & - "REAL_ELPA_KERNEL_SSE_BLOCK2 ", & - "REAL_ELPA_KERNEL_SSE_BLOCK4 ", & - "REAL_ELPA_KERNEL_SSE_BLOCK6 ", & - "REAL_ELPA_KERNEL_AVX_BLOCK2 ", & - "REAL_ELPA_KERNEL_AVX_BLOCK4 ", & - "REAL_ELPA_KERNEL_AVX_BLOCK6 ", & - "REAL_ELPA_KERNEL_AVX2_BLOCK2 ", & - "REAL_ELPA_KERNEL_AVX2_BLOCK4 ", & - 
"REAL_ELPA_KERNEL_AVX2_BLOCK6 "/) - - integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS - integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC - integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE = ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE - integer, parameter :: COMPLEX_ELPA_KERNEL_BGP = ELPA2_COMPLEX_KERNEL_BGP - integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ = ELPA2_COMPLEX_KERNEL_BGQ - integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE - integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK1 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 - integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2 = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 - integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 - -#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) - -#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC -#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ - -! 
go through all kernels and set them -#ifdef WITH_COMPLEX_GENERIC_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC -#endif -#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE -#endif -#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE -#endif - -#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) -#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 -#else -#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 -#endif -#endif -#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ - -#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) -#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 -#else -#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 -#endif -#endif -#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ - -#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ - -#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ - -#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC - -#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ - -! 
go through all kernels and set them -#ifdef WITH_COMPLEX_GENERIC_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC -#endif -#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE -#endif -#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE -#endif - -#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) -#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2 -#else -#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1 -#endif -#endif -#endif /* defined(WITH_COMPLEXL_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */ - -#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) -#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2 -#else -#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL - integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1 -#endif -#endif -#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */ - -#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */ - -#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */ - - character(35), parameter, dimension(number_of_complex_kernels) :: & - COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & - "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & - "COMPLEX_ELPA_KERNEL_BGP ", & - "COMPLEX_ELPA_KERNEL_BGQ ", & - "COMPLEX_ELPA_KERNEL_SSE ", & - "COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ", & - "COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ", & - "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", & - "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", & - "COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 ", & - "COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 "/) - - integer, parameter :: & - 
AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = & - (/ & -#if WITH_REAL_GENERIC_KERNEL - 1 & -#else - 0 & -#endif -#if WITH_REAL_GENERIC_SIMPLE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_BGP_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_BGQ_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_SSE_ASSEMBLY_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_SSE_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_SSE_BLOCK4_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_SSE_BLOCK6_KERNEL - ,1 & -#else - ,0 & - -#endif -#if WITH_REAL_AVX_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX_BLOCK4_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX_BLOCK6_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX2_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX2_BLOCK4_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_REAL_AVX2_BLOCK6_KERNEL - ,1 & -#else - ,0 & -#endif - - - /) - - integer, parameter :: & - AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) = & - (/ & -#if WITH_COMPLEX_GENERIC_KERNEL - 1 & -#else - 0 & -#endif -#if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_BGP_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_BGQ_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_SSE_BLOCK1_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_SSE_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif - -#if WITH_COMPLEX_AVX_BLOCK1_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_AVX_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_AVX2_BLOCK1_KERNEL - ,1 & -#else - ,0 & -#endif -#if WITH_COMPLEX_AVX2_BLOCK2_KERNEL - ,1 & -#else - ,0 & -#endif - - /) - -!****** - contains - subroutine print_available_real_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("print_available_real_kernels") -#endif - - do i=1, 
number_of_real_kernels - if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then - write(*,*) REAL_ELPA_KERNEL_NAMES(i) - endif - enddo - write(*,*) " " - write(*,*) " At the moment the following kernel would be choosen:" - write(*,*) get_actual_real_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("print_available_real_kernels") -#endif - - end subroutine print_available_real_kernels - - subroutine query_available_real_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("query_available_real_kernels") -#endif - - do i=1, number_of_real_kernels - if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then - write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i) - endif - enddo - write(error_unit,*) " " - write(error_unit,*) " At the moment the following kernel would be choosen:" - write(error_unit,*) get_actual_real_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("query_available_real_kernels") -#endif - - end subroutine query_available_real_kernels - - subroutine print_available_complex_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - - integer :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("print_available_complex_kernels") -#endif - - do i=1, number_of_complex_kernels - if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 
1) then - write(*,*) COMPLEX_ELPA_KERNEL_NAMES(i) - endif - enddo - write(*,*) " " - write(*,*) " At the moment the following kernel would be choosen:" - write(*,*) get_actual_complex_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("print_available_complex_kernels") -#endif - - end subroutine print_available_complex_kernels - - subroutine query_available_complex_kernels -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - - implicit none - - integer :: i -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("query_available_complex_kernels") -#endif - - do i=1, number_of_complex_kernels - if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then - write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i) - endif - enddo - write(error_unit,*) " " - write(error_unit,*) " At the moment the following kernel would be choosen:" - write(error_unit,*) get_actual_complex_kernel_name() - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("query_available_complex_kernels") -#endif - - end subroutine query_available_complex_kernels - - function get_actual_real_kernel() result(actual_kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_real_kernel") -#endif - - - ! if kernel is not choosen via api - ! check whether set by environment variable - actual_kernel = real_kernel_via_environment_variable() - - if (actual_kernel .eq. 0) then - ! 
if not then set default kernel - actual_kernel = DEFAULT_REAL_ELPA_KERNEL - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_real_kernel") -#endif - - end function get_actual_real_kernel - - function get_actual_real_kernel_name() result(actual_kernel_name) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - character(35) :: actual_kernel_name - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_real_kernel_name") -#endif - - actual_kernel = get_actual_real_kernel() - actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_real_kernel_name") -#endif - - end function get_actual_real_kernel_name - - function get_actual_complex_kernel() result(actual_kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_complex_kernel") -#endif - - - ! if kernel is not choosen via api - ! check whether set by environment variable - actual_kernel = complex_kernel_via_environment_variable() - - if (actual_kernel .eq. 0) then - ! 
if not then set default kernel - actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_complex_kernel") -#endif - - end function get_actual_complex_kernel - - function get_actual_complex_kernel_name() result(actual_kernel_name) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - character(35) :: actual_kernel_name - integer :: actual_kernel - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("get_actual_complex_kernel_name") -#endif - - actual_kernel = get_actual_complex_kernel() - actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("get_actual_complex_kernel_name") -#endif - - end function get_actual_complex_kernel_name - - function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer, intent(in) :: THIS_REAL_ELPA_KERNEL - - logical :: err - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("check_allowed_real_kernels") -#endif - err = .false. - - if (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1) err=.true. - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("check_allowed_real_kernels") -#endif - - end function check_allowed_real_kernels - - function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL - - logical :: err -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("check_allowed_complex_kernels") -#endif - err = .false. - - if (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1) err=.true. 
- -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("check_allowed_complex_kernels") -#endif - - end function check_allowed_complex_kernels - - function qr_decomposition_via_environment_variable(useQR) result(isSet) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - logical, intent(out) :: useQR - logical :: isSet - CHARACTER(len=255) :: ELPA_QR_DECOMPOSITION - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_decomposition_via_environment_variable") -#endif - - isSet = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION) -#endif - if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then - useQR = .true. - isSet = .true. - endif - if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then - useQR = .false. - isSet = .true. - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_decomposition_via_environment_variable") -#endif - - end function qr_decomposition_via_environment_variable - - - function real_kernel_via_environment_variable() result(kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: kernel - CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("real_kernel_via_environment_variable") -#endif - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT) -#endif - do i=1,size(REAL_ELPA_KERNEL_NAMES(:)) - ! if (trim(dummy_char) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then - if (trim(REAL_KERNEL_ENVIRONMENT) .eq. 
trim(REAL_ELPA_KERNEL_NAMES(i))) then - kernel = i - exit - else - kernel = 0 - endif - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("real_kernel_via_environment_variable") -#endif - - end function real_kernel_via_environment_variable - - function complex_kernel_via_environment_variable() result(kernel) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - integer :: kernel - - CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT - integer :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("complex_kernel_via_environment_variable") -#endif - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT) -#endif - - do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:)) - if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then - kernel = i - exit - else - kernel = 0 - endif - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("complex_kernel_via_environment_variable") -#endif - - end function -!------------------------------------------------------------------------------- - -end module ELPA2_utilities diff -Nru elpa-2016.05.001/src/elpa_abstract_impl.F90 elpa-2019.11.001/src/elpa_abstract_impl.F90 --- elpa-2016.05.001/src/elpa_abstract_impl.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_abstract_impl.F90 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,224 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +#include "config-f90.h" +!> \brief Fortran module to provide an abstract definition of the implementation. Do not use directly. Use the module "elpa" +module elpa_abstract_impl + use elpa_api + use elpa_generated_fortran_interfaces + +#ifdef HAVE_DETAILED_TIMINGS + use ftimings +#else + use timings_dummy +#endif + + + implicit none + + ! The reason to have this additional layer is to allow for members (here the + ! 'timer' object) that can be used internally but are not exposed to the + ! public API. This cannot be done via 'private' members, as the scope of + ! 'private' is per-file. + ! + ! 
Thus, other sub-types or suplementary routines cannot use these members + ! (unless they would all be implemented in one giant file) + ! + type, abstract, extends(elpa_t) :: elpa_abstract_impl_t +#ifdef HAVE_DETAILED_TIMINGS + type(timer_t) :: timer + type(timer_t) :: autotune_timer +#else + type(timer_dummy_t) :: timer + type(timer_dummy_t) :: autotune_timer +#endif + type(c_ptr) :: index = C_NULL_PTR + logical :: eigenvalues_only + contains + procedure, public :: elpa_set_integer !< private methods to implement the setting of an integer/double key/value pair + procedure, public :: elpa_set_double + + procedure, public :: elpa_get_integer !< private methods to implement the querry of an integer/double key/value pair + procedure, public :: elpa_get_double + + end type + + contains + + !> \brief internal subroutine to set an integer key/value pair + !> Parameters + !> \param self the allocated ELPA object + !> \param name string, the key + !> \param value integer, the value to be set + !> \result error integer, the error code + subroutine elpa_set_integer(self, name, value, error) + use iso_c_binding + use elpa_utilities, only : error_unit + class(elpa_abstract_impl_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), intent(in) :: value +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + integer :: actual_error + + actual_error = elpa_index_set_int_value_c(self%index, name // c_null_char, value) + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a,i0,a)') "ELPA: Error setting option '" // name // "' to value ", value, & + " (got: " // elpa_strerr(actual_error) // ") and you did not check for errors!" 
+ end if +#else + error = actual_error +#endif + end subroutine + + !> \brief internal subroutine to get an integer key/value pair + !> Parameters + !> \param self the allocated ELPA object + !> \param name string, the key + !> \param value integer, the value of the key/vaue pair + !> \param error integer, optional, to store an error code + subroutine elpa_get_integer(self, name, value, error) + use iso_c_binding + use elpa_utilities, only : error_unit + class(elpa_abstract_impl_t) :: self + character(*), intent(in) :: name + integer(kind=c_int) :: value +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + integer :: actual_error + + value = elpa_index_get_int_value_c(self%index, name // c_null_char, actual_error) + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error getting option '" // name // "'" // & + " (got: " // elpa_strerr(actual_error) // ") and you did not check for errors!" 
+ end if +#else + error = actual_error +#endif + end subroutine + + !> \brief internal subroutine to set a double key/value pair + !> Parameters + !> \param self the allocated ELPA object + !> \param name string, the key + !> \param value double, the value to be set + !> \result error integer, the error code + subroutine elpa_set_double(self, name, value, error) + use iso_c_binding + use elpa_utilities, only : error_unit + class(elpa_abstract_impl_t) :: self + character(*), intent(in) :: name + real(kind=c_double), intent(in) :: value + integer :: actual_error + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + actual_error = elpa_index_set_double_value_c(self%index, name // c_null_char, value) + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a,es12.5,a)') "ELPA: Error setting option '" // name // "' to value ", value, & + " (got: " // elpa_strerr(actual_error) // ") and you did not check for errors!" 
+ end if +#else + error = actual_error +#endif + end subroutine + + !> \brief internal subroutine to get an double key/value pair + !> Parameters + !> \param self the allocated ELPA object + !> \param name string, the key + !> \param value double, the value of the key/vaue pair + !> \param error integer, optional, to store an error code + subroutine elpa_get_double(self, name, value, error) + use iso_c_binding + use elpa_utilities, only : error_unit + class(elpa_abstract_impl_t) :: self + character(*), intent(in) :: name + real(kind=c_double) :: value +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + integer :: actual_error + + value = elpa_index_get_double_value_c(self%index, name // c_null_char, actual_error) +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error getting option '" // name // "'" // & + " (got: " // elpa_strerr(actual_error) // ") and you did not check for errors!" + end if +#else + error = actual_error +#endif + end subroutine + +end module diff -Nru elpa-2016.05.001/src/elpa_api.F90 elpa-2019.11.001/src/elpa_api.F90 --- elpa-2016.05.001/src/elpa_api.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_api.F90 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,1016 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +!> \brief Fortran module which provides the definition of the ELPA API. Do not use directly! 
Use the module "elpa" + +#include "config-f90.h" + +module elpa_api + use elpa_constants + use, intrinsic :: iso_c_binding + implicit none + +#include "src/elpa_generated_public_fortran_interfaces.h" + + integer, private, parameter :: earliest_api_version = EARLIEST_API_VERSION !< Definition of the earliest API version supported + !< with the current release + integer, private, parameter :: current_api_version = CURRENT_API_VERSION !< Definition of the current API version + + integer, private, parameter :: earliest_autotune_version = EARLIEST_AUTOTUNE_VERSION !< Definition of the earliest API version + !< which supports autotuning + integer, private :: api_version_set + logical, private :: initDone = .false. + + public :: elpa_t, & + c_int, & + c_double, c_double_complex, & + c_float, c_float_complex + + !> \brief Abstract definition of the elpa_t type + type, abstract :: elpa_t + private + + + !< these have to be public for proper bounds checking, sadly + integer(kind=c_int), public, pointer :: na => NULL() + integer(kind=c_int), public, pointer :: nev => NULL() + integer(kind=c_int), public, pointer :: local_nrows => NULL() + integer(kind=c_int), public, pointer :: local_ncols => NULL() + integer(kind=c_int), public, pointer :: nblk => NULL() + + contains + ! general + procedure(elpa_setup_i), deferred, public :: setup !< method to setup an ELPA object + procedure(elpa_destroy_i), deferred, public :: destroy !< method to destroy an ELPA object + + ! key/value store + generic, public :: set => & !< export a method to set integer/double key/values + elpa_set_integer, & + elpa_set_double + + generic, public :: get => & !< export a method to get integer/double key/values + elpa_get_integer, & + elpa_get_double + + procedure(elpa_is_set_i), deferred, public :: is_set !< method to check whether key/value is set + procedure(elpa_can_set_i), deferred, public :: can_set !< method to check whether key/value can be set + + ! 
call before setup if created from the legacy api + ! remove this function completely after the legacy api is dropped + procedure(elpa_creating_from_legacy_api_i), deferred, public :: creating_from_legacy_api + + ! Timer + procedure(elpa_get_time_i), deferred, public :: get_time !< method to get the times from the timer object + procedure(elpa_print_times_i), deferred, public :: print_times !< method to print the timings tree + procedure(elpa_timer_start_i), deferred, public :: timer_start !< method to start a time measurement + procedure(elpa_timer_stop_i), deferred, public :: timer_stop !< method to stop a time measurement + + + ! Actual math routines + generic, public :: eigenvectors => & !< method eigenvectors for solving the full eigenvalue problem + elpa_eigenvectors_d, & !< the eigenvalues and (parts of) the eigenvectors are computed + elpa_eigenvectors_f, & !< for symmetric real valued / hermitian complex valued matrices + elpa_eigenvectors_dc, & + elpa_eigenvectors_fc + + generic, public :: eigenvalues => & !< method eigenvalues for solving the eigenvalue problem + elpa_eigenvalues_d, & !< only the eigenvalues are computed + elpa_eigenvalues_f, & !< for symmetric real valued / hermitian complex valued matrices + elpa_eigenvalues_dc, & + elpa_eigenvalues_fc + + generic, public :: skew_eigenvectors => & !< method skew_eigenvectors for solving the full skew-symmetric eigenvalue problem + elpa_skew_eigenvectors_d, & !< the eigenvalues and (parts of) the eigenvectors are computed + elpa_skew_eigenvectors_f !< for symmetric real valued skew-symmetric matrices + + generic, public :: skew_eigenvalues => & !< method skew_eigenvalues for solving the skew-symmetric eigenvalue problem + elpa_skew_eigenvalues_d, & !< only the eigenvalues are computed + elpa_skew_eigenvalues_f !< for symmetric real valued skew-symmetric matrices + + + generic, public :: generalized_eigenvectors => & !< method eigenvectors for solving the full generalized eigenvalue problem + 
elpa_generalized_eigenvectors_d, & !< the eigenvalues and (parts of) the eigenvectors are computed + elpa_generalized_eigenvectors_f, & !< for symmetric real valued / hermitian complex valued matrices + elpa_generalized_eigenvectors_dc, & + elpa_generalized_eigenvectors_fc + + generic, public :: generalized_eigenvalues => & !< method eigenvectors for solving the full generalized eigenvalue problem + elpa_generalized_eigenvalues_d, & !< only the eigenvalues + elpa_generalized_eigenvalues_f, & !< for symmetric real valued / hermitian complex valued matrices + elpa_generalized_eigenvalues_dc, & + elpa_generalized_eigenvalues_fc + + generic, public :: hermitian_multiply => & !< method for a "hermitian" multiplication of matrices a and b + elpa_hermitian_multiply_d, & !< for real valued matrices: a**T * b + elpa_hermitian_multiply_dc, & !< for complex valued matrices a**H * b + elpa_hermitian_multiply_f, & + elpa_hermitian_multiply_fc + + generic, public :: cholesky => & !< method for the cholesky factorisation of matrix a + elpa_cholesky_d, & + elpa_cholesky_f, & + elpa_cholesky_dc, & + elpa_cholesky_fc + + generic, public :: invert_triangular => & !< method to invert a upper triangular matrix a + elpa_invert_trm_d, & + elpa_invert_trm_f, & + elpa_invert_trm_dc, & + elpa_invert_trm_fc + + generic, public :: solve_tridiagonal => & !< method to solve the eigenvalue problem for a tridiagonal + elpa_solve_tridiagonal_d, & !< matrix + elpa_solve_tridiagonal_f + + procedure(print_settings_i), deferred, public :: print_settings !< method to print all parameters + procedure(store_settings_i), deferred, public :: store_settings !< method to save all parameters + procedure(load_settings_i), deferred, public :: load_settings !< method to save all parameters +#ifdef ENABLE_AUTOTUNING + ! 
Auto-tune + procedure(elpa_autotune_setup_i), deferred, public :: autotune_setup !< method to prepare the ELPA autotuning + procedure(elpa_autotune_step_i), deferred, public :: autotune_step !< method to do an autotuning step + procedure(elpa_autotune_set_best_i), deferred, public :: autotune_set_best !< method to set the best options + procedure(elpa_autotune_print_best_i), deferred, public :: autotune_print_best !< method to print the best options + procedure(elpa_autotune_print_state_i), deferred, public :: autotune_print_state !< method to print the state + procedure(elpa_autotune_save_state_i), deferred, public :: autotune_save_state !< method to save the state + procedure(elpa_autotune_load_state_i), deferred, public :: autotune_load_state !< method to load the state +#endif + + !> \brief These method have to be public, in order to be overrideable in the extension types + procedure(elpa_set_integer_i), deferred, public :: elpa_set_integer + procedure(elpa_set_double_i), deferred, public :: elpa_set_double + + procedure(elpa_get_integer_i), deferred, public :: elpa_get_integer + procedure(elpa_get_double_i), deferred, public :: elpa_get_double + + procedure(elpa_eigenvectors_d_i), deferred, public :: elpa_eigenvectors_d + procedure(elpa_eigenvectors_f_i), deferred, public :: elpa_eigenvectors_f + procedure(elpa_eigenvectors_dc_i), deferred, public :: elpa_eigenvectors_dc + procedure(elpa_eigenvectors_fc_i), deferred, public :: elpa_eigenvectors_fc + + procedure(elpa_eigenvalues_d_i), deferred, public :: elpa_eigenvalues_d + procedure(elpa_eigenvalues_f_i), deferred, public :: elpa_eigenvalues_f + procedure(elpa_eigenvalues_dc_i), deferred, public :: elpa_eigenvalues_dc + procedure(elpa_eigenvalues_fc_i), deferred, public :: elpa_eigenvalues_fc + + procedure(elpa_skew_eigenvectors_d_i), deferred, public :: elpa_skew_eigenvectors_d + procedure(elpa_skew_eigenvectors_f_i), deferred, public :: elpa_skew_eigenvectors_f + + + procedure(elpa_skew_eigenvalues_d_i), 
deferred, public :: elpa_skew_eigenvalues_d + procedure(elpa_skew_eigenvalues_f_i), deferred, public :: elpa_skew_eigenvalues_f + + procedure(elpa_generalized_eigenvectors_d_i), deferred, public :: elpa_generalized_eigenvectors_d + procedure(elpa_generalized_eigenvectors_f_i), deferred, public :: elpa_generalized_eigenvectors_f + procedure(elpa_generalized_eigenvectors_dc_i), deferred, public :: elpa_generalized_eigenvectors_dc + procedure(elpa_generalized_eigenvectors_fc_i), deferred, public :: elpa_generalized_eigenvectors_fc + + procedure(elpa_generalized_eigenvalues_d_i), deferred, public :: elpa_generalized_eigenvalues_d + procedure(elpa_generalized_eigenvalues_f_i), deferred, public :: elpa_generalized_eigenvalues_f + procedure(elpa_generalized_eigenvalues_dc_i), deferred, public :: elpa_generalized_eigenvalues_dc + procedure(elpa_generalized_eigenvalues_fc_i), deferred, public :: elpa_generalized_eigenvalues_fc + + procedure(elpa_hermitian_multiply_d_i), deferred, public :: elpa_hermitian_multiply_d + procedure(elpa_hermitian_multiply_f_i), deferred, public :: elpa_hermitian_multiply_f + procedure(elpa_hermitian_multiply_dc_i), deferred, public :: elpa_hermitian_multiply_dc + procedure(elpa_hermitian_multiply_fc_i), deferred, public :: elpa_hermitian_multiply_fc + + procedure(elpa_cholesky_d_i), deferred, public :: elpa_cholesky_d + procedure(elpa_cholesky_f_i), deferred, public :: elpa_cholesky_f + procedure(elpa_cholesky_dc_i), deferred, public :: elpa_cholesky_dc + procedure(elpa_cholesky_fc_i), deferred, public :: elpa_cholesky_fc + + procedure(elpa_invert_trm_d_i), deferred, public :: elpa_invert_trm_d + procedure(elpa_invert_trm_f_i), deferred, public :: elpa_invert_trm_f + procedure(elpa_invert_trm_dc_i), deferred, public :: elpa_invert_trm_dc + procedure(elpa_invert_trm_fc_i), deferred, public :: elpa_invert_trm_fc + + procedure(elpa_solve_tridiagonal_d_i), deferred, public :: elpa_solve_tridiagonal_d + procedure(elpa_solve_tridiagonal_f_i), 
deferred, public :: elpa_solve_tridiagonal_f + end type elpa_t + +#ifdef ENABLE_AUTOTUNING + !> \brief Abstract definition of the elpa_autotune type + type, abstract :: elpa_autotune_t + private + contains + procedure(elpa_autotune_destroy_i), deferred, public :: destroy + procedure(elpa_autotune_print_i), deferred, public :: print + end type +#endif + + !> \brief definition of helper function to get C strlen + !> Parameters + !> \details + !> \param ptr type(c_ptr) : pointer to string + !> \result size integer(kind=c_size_t) : length of string + interface + pure function elpa_strlen_c(ptr) result(size) bind(c, name="strlen") + use, intrinsic :: iso_c_binding + implicit none + type(c_ptr), intent(in), value :: ptr + integer(kind=c_size_t) :: size + end function + end interface + + + !> \brief abstract definition of the ELPA setup method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \result error integer : error code, which can be queried with elpa_strerr() + abstract interface + function elpa_setup_i(self) result(error) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + integer :: error + end function + end interface + + !> \brief abstract definition of the print_settings method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param error integer, optional + !> Prints all the elpa parameters + abstract interface + subroutine print_settings_i(self, error) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of the store_settings method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param file_name string, the name of the file where to save the parameters + !> \param error integer, optional + !> Saves all the elpa parameters + abstract 
interface + subroutine store_settings_i(self, file_name, error) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer, optional, intent(out):: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of the load_settings method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param file_name string, the name of the file from which to load the parameters + !> \param error integer, optional + !> Loads all the elpa parameters + abstract interface + subroutine load_settings_i(self, file_name, error) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + +#ifdef ENABLE_AUTOTUNING + !> \brief abstract definition of the autotune setup method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param level integer: the level of "thoroughness" of the tuning steps + !> \param domain integer: domain (real/complex) which should be tuned + !> \result tune_state class(elpa_autotune_t): the autotuning object + abstract interface + function elpa_autotune_setup_i(self, level, domain, error) result(tune_state) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout), target :: self + integer, intent(in) :: level, domain + class(elpa_autotune_t), pointer :: tune_state +#ifdef USE_FORTRAN2008 + integer , optional :: error +#else + integer :: error +#endif + end function + end interface + + + !> \brief abstract definition of the autotune step method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> 
\param unfinished logical: state whether tuning is unfinished or not + !> \param error integer, optional + abstract interface + function elpa_autotune_step_i(self, tune_state, error) result(unfinished) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout) :: self + class(elpa_autotune_t), intent(inout), target :: tune_state + logical :: unfinished +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end function + end interface + + + !> \brief abstract definition of the autotune set_best method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error integer, optional + !> Sets the best combination of ELPA options + abstract interface + subroutine elpa_autotune_set_best_i(self, tune_state, error) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error + +#endif + end subroutine + end interface + + + !> \brief abstract definition of the autotune print best method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error integer, optional + !> Prints the best combination of ELPA options + abstract interface + subroutine elpa_autotune_print_best_i(self, tune_state, error) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error + +#endif + end subroutine + end interface + + !> \brief abstract definition of the autotune print state method + !> 
Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error integer, optional + !> Prints the autotuning state + abstract interface + subroutine elpa_autotune_print_state_i(self, tune_state, error) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of the autotune save state method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which should be tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param file_name string, the name of the file where to save the state + !> \param error integer, optional + !> Saves the autotuning state + abstract interface + subroutine elpa_autotune_save_state_i(self, tune_state, file_name, error) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of the autotune load state method + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object, which is being tuned + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param file_name string, the name of the file from which to load the autotuning state + !> \param error integer, optional + !> Loads all the elpa parameters + abstract interface + subroutine elpa_autotune_load_state_i(self, tune_state, file_name, error) + import elpa_t, elpa_autotune_t + implicit none + class(elpa_t), 
intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface +#endif + + !> \brief abstract definition of set method for integer values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \param value integer : the value to set for the key + !> \param error integer, optional : error code, which can be queried with elpa_strerr() + abstract interface + subroutine elpa_set_integer_i(self, name, value, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), intent(in) :: value +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of get method for integer values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \param value integer : the value corresponding to the key + !> \param error integer, optional : error code, which can be queried with elpa_strerr() + abstract interface + subroutine elpa_get_integer_i(self, name, value, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + integer(kind=c_int) :: value +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of is_set method for integer values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \result state integer : 1 is set, 0 if not, else a negativ error code + !> which can be queried with elpa_strerr + 
abstract interface + function elpa_is_set_i(self, name) result(state) + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + integer :: state + end function + end interface + + + !> \brief abstract definition of can_set method for integer values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \param value integer: the valye to associate with the key + !> \result state integer : 1 is set, 0 if not, else a negativ error code + !> which can be queried with elpa_strerr + abstract interface + function elpa_can_set_i(self, name, value) result(state) + import elpa_t, c_int + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), intent(in) :: value + integer :: state + end function + end interface + + + !> \brief abstract definition of set method for double values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !? \param value double: the value to associate with the key + !> \param error integer. 
optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_set_double_i(self, name, value, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + real(kind=c_double), intent(in) :: value +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of get method for double values + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \param value double: the value associated with the key + !> \param error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_get_double_i(self, name, value, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + real(kind=c_double) :: value +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of associate method for integer pointers + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name string: the name of the key + !> \result value integer, pointer: the value associated with the key + abstract interface + function elpa_associate_int_i(self, name) result(value) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), pointer :: value + end function + end interface + + + ! 
Timer routines + + !> \brief abstract definition of get_time method to querry the timer + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name1..6 string: the name of the timer entry, supports up to 6 levels + !> \result s double: the time for the entry name1..6 + abstract interface + function elpa_get_time_i(self, name1, name2, name3, name4, name5, name6) result(s) + import elpa_t, c_double + implicit none + class(elpa_t), intent(in) :: self + ! this is clunky, but what can you do.. + character(len=*), intent(in), optional :: name1, name2, name3, name4, name5, name6 + real(kind=c_double) :: s + end function + end interface + + + !> \brief abstract definition of print method for timer + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + abstract interface + subroutine elpa_print_times_i(self, name1, name2, name3, name4) + import elpa_t + implicit none + class(elpa_t), intent(in) :: self + character(len=*), intent(in), optional :: name1, name2, name3, name4 + end subroutine + end interface + + + !> \brief abstract definition of the start method for timer + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name character(len=*) the name of the entry int the timer tree + abstract interface + subroutine elpa_timer_start_i(self, name) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + character(len=*), intent(in) :: name + end subroutine + end interface + + + !> \brief abstract definition of the stop method for timer + !> Parameters + !> \details + !> \param self class(elpa_t): the ELPA object + !> \param name character(len=*) the name of the entry int the timer tree + abstract interface + subroutine elpa_timer_stop_i(self, name) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + character(len=*), intent(in) :: name + end subroutine + end interface + + ! 
Actual math routines + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_api_math_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_api_math_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_api_math_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "general/precision_macros.h" +#include "elpa_api_math_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION + +! end of math routines + + !> \brief abstract definition of interface to destroy an ELPA object + !> Parameters + !> \param self class(elpa_t), the ELPA object + !> \param error integer, optional, the error code + abstract interface + subroutine elpa_destroy_i(self, error) + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface + +#ifdef ENABLE_AUTOTUNING + !> \brief abstract definition of interface to print the autotuning state + !> Parameters + !> \param self class(elpa_autotune_t): the ELPA autotune object + abstract interface + subroutine elpa_autotune_print_i(self, error) + import elpa_autotune_t + implicit none + class(elpa_autotune_t), intent(in) :: self +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + + end subroutine + end interface + + + !> \brief abstract definition of interface to destroy the autotuning state + !> Parameters + !> \param self class(elpa_autotune_t): the ELPA autotune object + abstract interface + subroutine elpa_autotune_destroy_i(self, error) + import elpa_autotune_t + implicit none + class(elpa_autotune_t), intent(inout) :: 
self +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + end subroutine + end interface +#endif + + abstract interface + subroutine elpa_creating_from_legacy_api_i(self) + import elpa_t + implicit none + class(elpa_t), intent(inout) :: self + end subroutine + end interface + + contains + + + !> \brief function to intialize the ELPA library + !> Parameters + !> \param api_version integer: api_version that ELPA should use + !> \result error integer: error code, which can be queried with elpa_strerr + ! + !c> int elpa_init(int api_version); + function elpa_init(api_version) result(error) bind(C, name="elpa_init") + use elpa_utilities, only : error_unit + use iso_c_binding + integer(kind=c_int), intent(in), value :: api_version + integer(kind=c_int) :: error + + if (earliest_api_version <= api_version .and. api_version <= current_api_version) then + initDone = .true. + api_version_set = api_version + error = ELPA_OK + else + write(error_unit, "(a,i0,a)") "ELPA: Error API version ", api_version," is not supported by this library" + error = ELPA_ERROR_API_VERSION + endif + + end function + + + !> \brief function to check whether the ELPA library has been correctly initialised + !> Parameters + !> \result state integer: state is either ELPA_OK or ELPA_ERROR, which can be queried with elpa_strerr + function elpa_initialized() result(state) + integer :: state + if (initDone) then + state = ELPA_OK + else + state = ELPA_ERROR_CRITICAL + endif + end function + + function elpa_get_api_version() result(api_version) + integer :: api_version + + api_version = api_version_set + end function + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #define elpa_uninit(...) CONC(elpa_uninit, NARGS(__VA_ARGS__))(__VA_ARGS__) + !c_o> #endif +#endif + !> \brief subroutine to uninit the ELPA library. Does nothing at the moment. Might do sth. later + ! 
+#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> void elpa_uninit1(int *error); + !c_o> void elpa_uninit0(); + !c_o> #endif + subroutine elpa_uninit_c1(error) bind(C, name="elpa_uninit1") + integer(kind=c_int) :: error + call elpa_uninit(error) + end subroutine + + subroutine elpa_uninit_c0() bind(C, name="elpa_uninit0") + call elpa_uninit() + end subroutine +#else + !c_no> #ifndef OPTIONAL_C_ERROR_ARGUMENT + !c_no> void elpa_uninit(int *error); + !c_no> #endif + subroutine elpa_uninit_c(error) bind(C, name="elpa_uninit") + integer(kind=c_int) :: error + call elpa_uninit(error) + end subroutine +#endif + + subroutine elpa_uninit(error) +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) error = ELPA_OK +#else + error = ELPA_OK +#endif + end subroutine + !> \brief helper function for error strings + !> Parameters + !> \param elpa_error integer: error code to querry + !> \result string string: error string + function elpa_strerr(elpa_error) result(string) + integer, intent(in) :: elpa_error + character(kind=C_CHAR, len=elpa_strlen_c(elpa_strerr_c(elpa_error))), pointer :: string + call c_f_pointer(elpa_strerr_c(elpa_error), string) + end function + + + !> \brief helper function for c strings + !> Parameters + !> \param ptr type(c_ptr) + !> \result string string + function elpa_c_string(ptr) result(string) + use, intrinsic :: iso_c_binding + type(c_ptr), intent(in) :: ptr + character(kind=c_char, len=elpa_strlen_c(ptr)), pointer :: string + call c_f_pointer(ptr, string) + end function + + + !> \brief function to convert an integer in its string representation + !> Parameters + !> \param name string: the key + !> \param value integer: the value correponding to the key + !> \param error integer, optional: error code, which can be queried with elpa_strerr() + !> \result string string: the string representation + function 
elpa_int_value_to_string(name, value, error) result(string) + use elpa_utilities, only : error_unit + implicit none + character(kind=c_char, len=*), intent(in) :: name + integer(kind=c_int), intent(in) :: value + integer(kind=c_int), intent(out), optional :: error + +#ifdef PGI_VARIABLE_STRING_BUG + character(kind=c_char, len=elpa_int_value_to_strlen_c(name // C_NULL_CHAR, value)), pointer :: string_ptr + character(kind=c_char, len=elpa_int_value_to_strlen_c(name // C_NULL_CHAR, value)) :: string +#else + character(kind=c_char, len=elpa_int_value_to_strlen_c(name // C_NULL_CHAR, value)), pointer :: string +#endif + + integer(kind=c_int) :: actual_error + type(c_ptr) :: ptr + + actual_error = elpa_int_value_to_string_c(name // C_NULL_CHAR, value, ptr) + if (c_associated(ptr)) then +#ifdef PGI_VARIABLE_STRING_BUG + call c_f_pointer(ptr, string_ptr) + string = string_ptr +#else + call c_f_pointer(ptr, string) +#endif + else +#ifdef PGI_VARIABLE_STRING_BUG + nullify(string_ptr) +#else + nullify(string) +#endif + endif + + if (present(error)) then + error = actual_error + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a,i0,a)') "ELPA: Error converting value '", value, "' to a string for option '" // & + name // "' and you did not check for errors: " // elpa_strerr(actual_error) + endif + end function + + + !> \brief function to convert a string in its integer representation: + !> Parameters + !> \param name string: the key + !> \param string string: the string whose integer representation should be associated with the key + !> \param error integer, optional: error code, which can be queried with elpa_strerr() + !> \result value integer: the integer representation of the string + function elpa_int_string_to_value(name, string, error) result(value) + use elpa_generated_fortran_interfaces + use elpa_utilities, only : error_unit + implicit none + character(kind=c_char, len=*), intent(in) :: name + character(kind=c_char, len=*), intent(in), target :: string + 
integer(kind=c_int), intent(out), optional :: error + integer(kind=c_int) :: actual_error + + integer(kind=c_int) :: value + + actual_error = elpa_int_string_to_value_c(name // C_NULL_CHAR, string // C_NULL_CHAR, value) + + if (present(error)) then + error = actual_error + else if (actual_error /= ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error converting string '" // string // "' to value for option '" // & + name // "' and you did not check for errors: " // elpa_strerr(actual_error) + endif + end function + + + !> \brief function to get the number of possible choices for an option + !> Parameters + !> \param option_name string: the option + !> \result number integer: the total number of possible values to be chosen + function elpa_option_cardinality(option_name) result(number) + use elpa_generated_fortran_interfaces + character(kind=c_char, len=*), intent(in) :: option_name + integer :: number + number = elpa_option_cardinality_c(option_name // C_NULL_CHAR) + end function + + + !> \brief function to enumerate an option + !> Parameters + !> \param option_name string: the option + !> \param i integer + !> \result option integer + function elpa_option_enumerate(option_name, i) result(option) + use elpa_generated_fortran_interfaces + character(kind=c_char, len=*), intent(in) :: option_name + integer, intent(in) :: i + integer :: option + option = elpa_option_enumerate_c(option_name // C_NULL_CHAR, i) + end function + +end module diff -Nru elpa-2016.05.001/src/elpa_api_math_template.F90 elpa-2019.11.001/src/elpa_api_math_template.F90 --- elpa-2016.05.001/src/elpa_api_math_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_api_math_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,517 @@ + + !> \brief abstract definition of interface to solve double real eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to 
be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues + !> \param q double real matrix q: on output stores the eigenvectors +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues + !> \param q single real matrix q: on output stores the eigenvectors +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues + !> \param q double complex matrix q: on output stores the eigenvectors +#endif +#if ELPA_IMPL_SUFFIX == fc + !> \param a single complex matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues + !> \param q single complex matrix q: on output stores the eigenvectors +#endif + !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, ev, q, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), q(self%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), q(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of interface 
to solve double real skew-symmetric eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues + !> \param q double real matrix q: on output stores the eigenvectors +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues + !> \param q single real matrix q: on output stores the eigenvectors +#endif + !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_skew_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, ev, q, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), q(self%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), q(self%local_nrows, 2*self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + !> \brief abstract definition of interface to solve a eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and 
MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX ==fc + !> \param a single complex matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues +#endif + !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, ev, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of interface to solve a skew-symmetric eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with 
the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues +#endif + !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_skew_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, ev, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of interface to solve a generalized eigenvalue problem + !> + !> The dimensions of the matrix a and b (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param b double real matrix b: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues + !> \param q double real matrix q: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines 
the problem to solve + !> \param b single real matrix b: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues + !> \param q single real matrix q: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix a: defines the problem to solve + !> \param b double complex matrix b: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues + !> \param q double complex matrix q: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == fc + !> \param a single complex matrix a: defines the problem to solve + !> \param b single complex matrix b: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues + !> \param q single complex matrix q: on output stores the eigenvalues +#endif + + !> \param is_already_decomposed logical, input: is it repeated call with the same b (decomposed in the fist call)? + !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_generalized_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, b, ev, q, is_already_decomposed, error) + use iso_c_binding + use elpa_constants + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), b(self%local_nrows, *), q(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), b(self%local_nrows, self%local_ncols), & + q(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + + logical :: is_already_decomposed + integer, optional :: error + end subroutine + end interface + + !> \brief abstract definition of interface to solve a generalized eigenvalue problem + !> + !> The dimensions of the matrix a and b (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed 
and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> Parameters + !> \details + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix a: defines the problem to solve + !> \param b double real matrix b: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix a: defines the problem to solve + !> \param b single real matrix b: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix a: defines the problem to solve + !> \param b double complex matrix b: defines the problem to solve + !> \param ev double real: on output stores the eigenvalues +#endif +#if ELPA_IMPL_SUFFIX == fc + !> \param a single complex matrix a: defines the problem to solve + !> \param b single complex matrix b: defines the problem to solve + !> \param ev single real: on output stores the eigenvalues +#endif + + !> \param is_already_decomposed logical, input: is it repeated call with the same b (decomposed in the fist call)? 
+ !> \result error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_generalized_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_i(self, a, b, ev, is_already_decomposed, error) + use iso_c_binding + use elpa_constants + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), b(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), b(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + + logical :: is_already_decomposed + integer, optional :: error + end subroutine + end interface + + + !> \brief abstract definition of interface to compute C : = A**T * B + !> where A is a square matrix (self%a,self%na) which is optionally upper or lower triangular + !> B is a (self%na,ncb) matrix + !> C is a (self%na,ncb) matrix where optionally only the upper or lower + !> triangle may be computed + !> + !> the MPI commicators are already known to the type. Thus the class method "setup" must be called + !> BEFORE this method is used + !> \details + !> + !> \param self class(elpa_t), the ELPA object + !> \param uplo_a 'U' if A is upper triangular + !> 'L' if A is lower triangular + !> anything else if A is a full matrix + !> Please note: This pertains to the original A (as set in the calling program) + !> whereas the transpose of A is used for calculations + !> If uplo_a is 'U' or 'L', the other triangle is not used at all, + !> i.e. it may contain arbitrary numbers + !> \param uplo_c 'U' if only the upper diagonal part of C is needed + !> 'L' if only the upper diagonal part of C is needed + !> anything else if the full matrix C is needed + !> Please note: Even when uplo_c is 'U' or 'L', the other triangle may be + !> written to a certain extent, i.e. one shouldn't rely on the content there! 
+ !> \param ncb Number of columns of global matrices B and C + !> \param a matrix a + !> \param self%local_nrows number of rows of local (sub) matrix a, set with method set("local_nrows,value") + !> \param self%local_ncols number of columns of local (sub) matrix a, set with method set("local_ncols,value") + !> \param b matrix b + !> \param nrows_b number of rows of local (sub) matrix b + !> \param ncols_b number of columns of local (sub) matrix b + !> \param nblk blocksize of cyclic distribution, must be the same in both directions! + !> \param c matrix c + !> \param nrows_c number of rows of local (sub) matrix c + !> \param ncols_c number of columns of local (sub) matrix c + !> \param error optional argument, error code which can be queried with elpa_strerr + abstract interface + subroutine elpa_hermitian_multiply_& + &ELPA_IMPL_SUFFIX& + &_i (self,uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, & + c, nrows_c, ncols_c, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + character*1 :: uplo_a, uplo_c + integer(kind=c_int), intent(in) :: nrows_b, ncols_b, nrows_c, ncols_c, ncb +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*), b(nrows_b,*), c(nrows_c,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols), b(nrows_b,ncols_b), c(nrows_c,ncols_c) +#endif + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + !> \brief abstract definition of interface to do a cholesky decomposition of a matrix + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> Parameters + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix: the matrix to be decomposed +#endif +#if 
ELPA_IMPL_SUFFIX == f + !> \param a single real matrix: the matrix to be decomposed +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix: the matrix to be decomposed +#endif +#if ELPA_IMPL_SUFFIX == fc + !> \param a single complex matrix: the matrix to be decomposed +#endif + !> \param error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_cholesky_& + &ELPA_IMPL_SUFFIX& + &_i (self, a, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols) +#endif + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + + !> \brief abstract definition of interface to invert a triangular matrix + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> Parameters + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param a double real matrix: the matrix to be inverted +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param a single real matrix: the matrix to be inverted +#endif +#if ELPA_IMPL_SUFFIX == dc + !> \param a double complex matrix: the matrix to be inverted +#endif +#if ELPA_IMPL_SUFFIX == fc + !> \param a single complex matrix: the matrix to be inverted +#endif + + !> \param error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_invert_trm_& + &ELPA_IMPL_SUFFIX& + &_i (self, a, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*) +#else + 
MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols) +#endif + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + + + + !> \brief abstract definition of interface to solve the eigenvalue problem for a valued tridiangular matrix + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> Parameters + !> \param self class(elpa_t), the ELPA object +#if ELPA_IMPL_SUFFIX == d + !> \param d double real 1d array: the diagonal elements of a matrix defined in setup, on output the eigenvalues + !> in ascending order + !> \param e double real 1d array: the subdiagonal elements of a matrix defined in setup + !> \param q double real matrix: on output contains the eigenvectors +#endif +#if ELPA_IMPL_SUFFIX == f + !> \param d single real 1d array: the diagonal elements of a matrix defined in setup, on output the eigenvalues + !> in ascending order + !> \param e single real 1d array: the subdiagonal elements of a matrix defined in setup + !> \param q single real matrix: on output contains the eigenvectors +#endif + !> \param error integer, optional : error code, which can be queried with elpa_strerr + abstract interface + subroutine elpa_solve_tridiagonal_& + &ELPA_IMPL_SUFFIX& + &_i (self, d, e, q, error) + use iso_c_binding + import elpa_t + implicit none + class(elpa_t) :: self + real(kind=C_REAL_DATATYPE) :: d(self%na), e(self%na) +#ifdef USE_ASSUMED_SIZE + real(kind=C_REAL_DATATYPE) :: q(self%local_nrows,*) +#else + real(kind=C_REAL_DATATYPE) :: q(self%local_nrows,self%local_ncols) +#endif + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + end subroutine + end interface + diff -Nru elpa-2016.05.001/src/elpa_autotune_impl.F90 
elpa-2019.11.001/src/elpa_autotune_impl.F90 --- elpa-2016.05.001/src/elpa_autotune_impl.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_autotune_impl.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,57 @@ +#include "config-f90.h" + +module elpa_autotune_impl + use elpa_abstract_impl + use, intrinsic :: iso_c_binding + implicit none +#ifdef ENABLE_AUTOTUNING + type, extends(elpa_autotune_t) :: elpa_autotune_impl_t + class(elpa_abstract_impl_t), pointer :: parent => NULL() + integer :: current = 0 + real(kind=C_DOUBLE) :: min_val = 0.0_C_DOUBLE + integer :: min_loc = 0 + integer :: cardinality = 0 + integer :: level = 0 + integer :: domain = 0 + contains + procedure, public :: print => elpa_autotune_print + procedure, public :: destroy => elpa_autotune_destroy + end type + + contains + + !> \brief function to print the autotuning + !> Parameters + !> \param self class(elpa_autotune_impl_t) the allocated ELPA autotune object + subroutine elpa_autotune_print(self, error) + implicit none + class(elpa_autotune_impl_t), intent(in) :: self +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + end subroutine + + !> \brief function to destroy an elpa autotune object + !> Parameters + !> \param self class(elpa_autotune_impl_t) the allocated ELPA autotune object + !> \param error integer, optional error code + subroutine elpa_autotune_destroy(self, error) + implicit none + class(elpa_autotune_impl_t), intent(inout) :: self +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + + ! 
nothing to do atm +#ifdef USE_FORTRAN2008 + if (present(error)) error = ELPA_OK +#else + error = ELPA_OK +#endif + end subroutine +#endif +end module diff -Nru elpa-2016.05.001/src/elpa_c_interface.c elpa-2019.11.001/src/elpa_c_interface.c --- elpa-2016.05.001/src/elpa_c_interface.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_c_interface.c 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,72 @@ +/* +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +*/ + +#include + +/* + !pf> interface + !pf> pure function elpa_strerr_c(elpa_error) result(string) bind(C, name="elpa_strerr") + !pf> import c_int, c_ptr + !pf> integer(kind=c_int), intent(in), value :: elpa_error + !pf> type(c_ptr) :: string + !pf> end function + !pf> end interface + */ +const char *elpa_strerr(int elpa_error) { +#define NAME_CASE(name, value) \ + case value: \ + return #name; + + switch(elpa_error) { + ELPA_FOR_ALL_ERRORS(NAME_CASE) + default: + return "(Unknown error code)"; + } +} diff -Nru elpa-2016.05.001/src/elpa_c_interface.F90 elpa-2019.11.001/src/elpa_c_interface.F90 --- elpa-2016.05.001/src/elpa_c_interface.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_c_interface.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,311 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! 
ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Andreas Marek, MCPDF -#include "config-f90.h" - !c> #include - - !c> /*! 
\brief C old, deprecated interface to create the MPI communicators for ELPA - !c> * - !c> * \param mpi_comm_word MPI global communicator (in) - !c> * \param my_prow Row coordinate of the calling process in the process grid (in) - !c> * \param my_pcol Column coordinate of the calling process in the process grid (in) - !c> * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - !c> * \result int integer error value of mpi_comm_split function - !c> */ - !c> int elpa_get_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - function get_elpa_row_col_comms_wrapper_c_name1(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - result(mpierr) bind(C,name="elpa_get_communicators") - use, intrinsic :: iso_c_binding - use elpa1, only : get_elpa_row_col_comms - - implicit none - integer(kind=c_int) :: mpierr - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - mpierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - end function - !c> #include - - !c> /*! 
\brief C interface to create the MPI communicators for ELPA - !c> * - !c> * \param mpi_comm_word MPI global communicator (in) - !c> * \param my_prow Row coordinate of the calling process in the process grid (in) - !c> * \param my_pcol Column coordinate of the calling process in the process grid (in) - !c> * \param mpi_comm_rows Communicator for communicating within rows of processes (out) - !c> * \result int integer error value of mpi_comm_split function - !c> */ - !c> int get_elpa_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); - function get_elpa_row_col_comms_wrapper_c_name2(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - result(mpierr) bind(C,name="get_elpa_communicators") - use, intrinsic :: iso_c_binding - use elpa1, only : get_elpa_row_col_comms - - implicit none - integer(kind=c_int) :: mpierr - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - mpierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - - end function - - - - !c> /*! \brief C interface to solve the real eigenvalue problem with 1-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. 
- !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c>*/ - !c> int elpa_solve_evp_real_1stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - function solve_elpa1_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_real_1stage") - - use, intrinsic :: iso_c_binding - use elpa1, only : solve_evp_real - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - logical :: successFortran - - successFortran = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - - - !c> /*! \brief C interface to solve the complex eigenvalue problem with 1-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. 
- !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_complex_1stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); - function solve_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) & - result(success) bind(C,name="elpa_solve_evp_complex_1stage") - - use, intrinsic :: iso_c_binding - use elpa1, only : solve_evp_complex - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows - complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - real(kind=c_double) :: ev(1:na) - - logical :: successFortran - - successFortran = solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - !c> /*! \brief C interface to solve the real eigenvalue problem with 2-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). 
- !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! - !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * \param mpi_coll_all MPI communicator for the total processor set - !c> * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - !c> * \param use_qr use QR decomposition 1 = yes, 0 = no - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_real_2stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR); - function solve_elpa2_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & - THIS_REAL_ELPA_KERNEL_API, useQR) & - result(success) bind(C,name="elpa_solve_evp_real_2stage") - - use, intrinsic :: iso_c_binding - use elpa2, only : solve_evp_real_2stage - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & - mpi_comm_all - integer(kind=c_int), value, intent(in) :: THIS_REAL_ELPA_KERNEL_API, useQR - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - - - logical :: successFortran, useQRFortran - - if (useQR .eq. 0) then - useQRFortran =.false. - else - useQRFortran = .true. 
- endif - - successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, mpi_comm_all, & - THIS_REAL_ELPA_KERNEL_API, useQRFortran) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - - - !c> /*! \brief C interface to solve the complex eigenvalue problem with 2-stage solver - !c> * - !c> * \param na Order of matrix a - !c> * \param nev Number of eigenvalues needed. - !c> * The smallest nev eigenvalues/eigenvectors are calculated. - !c> * \param a Distributed matrix for which eigenvalues are to be computed. - !c> * Distribution is like in Scalapack. - !c> * The full matrix must be set (not only one half like in scalapack). - !c> * \param lda Leading dimension of a - !c> * \param ev(na) On output: eigenvalues of a, every processor gets the complete set - !c> * \param q On output: Eigenvectors of a - !c> * Distribution is like in Scalapack. - !c> * Must be always dimensioned to the full size (corresponding to (na,na)) - !c> * even if only a part of the eigenvalues is needed. - !c> * \param ldq Leading dimension of q - !c> * \param nblk blocksize of cyclic distribution, must be the same in both directions! 
- !c> * \param matrixCols distributed number of matrix columns - !c> * \param mpi_comm_rows MPI-Communicator for rows - !c> * \param mpi_comm_cols MPI-Communicator for columns - !c> * \param mpi_coll_all MPI communicator for the total processor set - !c> * \param THIS_REAL_ELPA_KERNEL_API specify used ELPA2 kernel via API - !c> * \param use_qr use QR decomposition 1 = yes, 0 = no - !c> * - !c> * \result int: 1 if error occured, otherwise 0 - !c> */ - !c> int elpa_solve_evp_complex_2stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API); - function solve_elpa2_evp_complex_wrapper(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & - THIS_COMPLEX_ELPA_KERNEL_API) & - result(success) bind(C,name="elpa_solve_evp_complex_2stage") - - use, intrinsic :: iso_c_binding - use elpa2, only : solve_evp_complex_2stage - - implicit none - integer(kind=c_int) :: success - integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, & - mpi_comm_all - integer(kind=c_int), value, intent(in) :: THIS_COMPLEX_ELPA_KERNEL_API - complex(kind=c_double_complex) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - real(kind=c_double) :: ev(1:na) - logical :: successFortran - - successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) - - if (successFortran) then - success = 1 - else - success = 0 - endif - - end function - diff -Nru elpa-2016.05.001/src/elpa_constants.F90 elpa-2019.11.001/src/elpa_constants.F90 --- elpa-2016.05.001/src/elpa_constants.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_constants.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,56 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! 
This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! 
+module elpa_constants + use, intrinsic :: iso_c_binding, only : C_INT + implicit none + public + + integer(kind=C_INT), parameter :: SC_DESC_LEN = 9 + +#include "src/fortran_constants.F90" +end module diff -Nru elpa-2016.05.001/src/elpa.F90 elpa-2019.11.001/src/elpa.F90 --- elpa-2016.05.001/src/elpa.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,462 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! 
ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! + +! The ELPA public API + + +!> \mainpage +!> Eigenvalue SoLvers for Petaflop-Applications (ELPA) +!> \par +!> http://elpa.mpcdf.mpg.de +!> +!> \par +!> The ELPA library was originally created by the ELPA consortium, +!> consisting of the following organizations: +!> +!> - Max Planck Computing and Data Facility (MPCDF) formerly known as +!> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +!> - Bergische Universität Wuppertal, Lehrstuhl für angewandte +!> Informatik, +!> - Technische Universität München, Lehrstuhl für Informatik mit +!> Schwerpunkt Wissenschaftliches Rechnen , +!> - Fritz-Haber-Institut, Berlin, Abt. Theorie, +!> - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +!> Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +!> and +!> - IBM Deutschland GmbH +!> +!> Some parts and enhancements of ELPA have been contributed and authored +!> by the Intel Corporation and Nvidia Corporation, which are not part of +!> the ELPA consortium. 
+!> +!> Maintainance and development of the ELPA library is done by the +!> Max Planck Computing and Data Facility (MPCDF) +!> +!> +!> Futher support of the ELPA library is done by the ELPA-AEO consortium, +!> consisting of the following organizations: +!> +!> - Max Planck Computing and Data Facility (MPCDF) formerly known as +!> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +!> - Bergische Universität Wuppertal, Lehrstuhl für angewandte +!> Informatik, +!> - Technische Universität München, Lehrstuhl für Informatik mit +!> Schwerpunkt Wissenschaftliches Rechnen , +!> - Technische Universität München, Lehrstuhl für theoretische Chemie, +!> - Fritz-Haber-Institut, Berlin, Abt. Theorie +!> +!> +!> Contributions to the ELPA source have been authored by (in alphabetical order): +!> +!> \author T. Auckenthaler, Volker Blum, A. Heinecke, L. Huedepohl, R. Johanni, Werner Jürgens, Pavel Kus, and A. Marek +!> +!> All the important information is in the \ref elpa_api::elpa_t derived type +!> +!> \brief Abstract definition of the elpa_t type +!> +!> +!> A typical usage of ELPA might look like this: +!> +!> Fortran synopsis +!> +!> \code{.f90} +!> use elpa +!> class(elpa_t), pointer :: elpa +!> integer :: success +!> +!> ! We urge the user to always check the error code of all ELPA functions +!> +!> if (elpa_init(20191110) /= ELPA_OK) then +!> print *, "ELPA API version not supported" +!> stop +!> endif +!> elpa => elpa_allocate(success) +!> if (success /= ELPA_OK) then +!> print *,"Could not allocate ELPA" +!> endif +!> +!> ! set parameters decribing the matrix and it's MPI distribution +!> call elpa%set("na", na, success, success) +!> if (success /= ELPA_OK) then +!> print *,"Could not set entry" +!> endif +!> call elpa%set("nev", nev, success, success) +!> ! check success code ... +!> +!> call elpa%set("local_nrows", na_rows, success) +!> ! check success code ... 
+!> +!> call elpa%set("local_ncols", na_cols, success) +!> call elpa%set("nblk", nblk, success) +!> call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success) +!> call elpa%set("process_row", my_prow, success) +!> call elpa%set("process_col", my_pcol, success) +!> +!> ! set up the elpa object +!> success = elpa%setup() +!> if (succes /= ELPA_OK) then +!> print *,"Could not setup ELPA object" +!> endif +!> +!> ! if desired, set tunable run-time options +!> ! here we want to use the 2-stage solver +!> call elpa%set("solver", ELPA_SOLVER_2STAGE, success) +!> +!> ! and set a specific kernel (must be supported on the machine) +!> call elpa%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2) +!> \endcode +!> ... set and get all other options that are desired +!> \code{.f90} +!> +!> ! if wanted you can store the settings and load them in another program +!> call elpa%store_settings("save_to_disk.txt", success) +!> +!> ! use method solve to solve the eigenvalue problem to obtain eigenvalues +!> ! and eigenvectors +!> ! other possible methods are desribed in \ref elpa_api::elpa_t derived type +!> call elpa%eigenvectors(a, ev, z, success) +!> +!> ! cleanup +!> call elpa_deallocate(e, success) +!> +!> call elpa_uninit() +!> \endcode +!> +!> +!> C synopsis +!> +!> \code{.c} +!> #include +!> +!> elpa_t handle; +!> int error; +!> +!> /* We urge the user to always check the error code of all ELPA functions */ +!> +!> if (elpa_init(20191110) != ELPA_OK) { +!> fprintf(stderr, "Error: ELPA API version not supported"); +!> exit(1); +!> } +!> +!> +!> handle = elpa_allocate(&error); +!> if (error != ELPA_OK) { +!> /* do sth. 
*/ +!> } +!> +!> /* Set parameters the matrix and it's MPI distribution */ +!> elpa_set(handle, "na", na, &error); +!> elpa_set(handle, "nev", nev, &error); +!> elpa_set(handle, "local_nrows", na_rows, &error); +!> elpa_set(handle, "local_ncols", na_cols, &error); +!> elpa_set(handle, "nblk", nblk, &error); +!> elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(MPI_COMM_WORLD), &error); +!> elpa_set(handle, "process_row", my_prow, &error); +!> elpa_set(handle, "process_col", my_pcol, &error); +!> +!> /* Setup */ +!> error = elpa_setup(handle); +!> +!> /* if desired, set tunable run-time options */ +!> /* here we want to use the 2-stage solver */ +!> elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error); +!> +!> elpa_set(handle,"real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error); +!> \endcode +!> ... set and get all other options that are desired +!> \code{.c} +!> +!> /* if you want you can store the settings and load them in another program */ +!> elpa_store_settings(handle, "save_to_disk.txt"); +!> +!> /* use method solve to solve the eigenvalue problem */ +!> /* other possible methods are desribed in \ref elpa_api::elpa_t derived type */ +!> elpa_eigenvectors(handle, a, ev, z, &error); +!> +!> /* cleanup */ +!> elpa_deallocate(handle, &error); +!> elpa_uninit(); +!> \endcode +!> +!> the autotuning could be used like this: +!> +!> Fortran synopsis +!> +!> \code{.f90} +!> use elpa +!> class(elpa_t), pointer :: elpa +!> class(elpa_autotune_t), pointer :: tune_state +!> integer :: success +!> +!> if (elpa_init(20191110) /= ELPA_OK) then +!> print *, "ELPA API version not supported" +!> stop +!> endif +!> elpa => elpa_allocate(success) +!> +!> ! 
set parameters decribing the matrix and it's MPI distribution +!> call elpa%set("na", na, success) +!> call elpa%set("nev", nev, success) +!> call elpa%set("local_nrows", na_rows, success) +!> call elpa%set("local_ncols", na_cols, success) +!> call elpa%set("nblk", nblk, success) +!> call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success) +!> call elpa%set("process_row", my_prow, success) +!> call elpa%set("process_col", my_pcol, success) +!> +!> ! set up the elpa object +!> success = elpa%setup() +!> +!> ! create autotune object +!> tune_state => elpa%autotune_setup(ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, success) +!> +!> ! you can set some options, these will be then FIXED for the autotuning step +!> ! if desired, set tunable run-time options +!> ! here we want to use the 2-stage solver +!> call e%set("solver", ELPA_SOLVER_2STAGE, success) +!> +!> ! and set a specific kernel (must be supported on the machine) +!> call e%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, success) +!> \endcode +!> ... set and get all other options that are desired +!> \code{.f90} +!> +!> iter = 0 +!> do while (elpa%autotune_step(tune_state, success)) +!> iter = iter + 1 +!> call e%eigenvectors(a, ev, z, success) +!> +!> ! if needed you can save the autotune state at any point +!> ! and resume it +!> if (iter > MAX_ITER) then +!> call elpa%autotune_save_state(tune_state,"autotune_checkpoint.txt", success) +!> exit +!> endif +!> enddo +!> +!> !set and print the finished autotuning +!> call elpa%autotune_set_best(tune_state, success) +!> +!> ! store _TUNED_ ELPA object, if needed +!> call elpa%store("autotuned_object.txt", success) +!> +!> !deallocate autotune object +!> call elpa_autotune_deallocate(tune_state, success) +!> +!> ! cleanup +!> call elpa_deallocate(e, success) +!> +!> call elpa_uninit() +!> \endcode +!> +!> More examples can be found in the folder "test", where Fortran and C example programs +!> are stored + +!> \brief Fortran module to use the ELPA library. 
No other module shoule be used + +#include "config-f90.h" +module elpa + use elpa_constants + use elpa_api + + implicit none + public + + contains + + !> \brief function to allocate an ELPA instance + !> Parameters + !> \details + !> \params error integer, optional : error code + !> \result obj class(elpa_t), pointer : pointer to allocated object + function elpa_allocate(error) result(obj) + use elpa_impl + class(elpa_t), pointer :: obj +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + integer :: error2 + + obj => elpa_impl_allocate(error2) + +#ifdef USE_FORTRAN2008 + if (present(error)) then +#endif + error = error2 + if (error .ne. ELPA_OK) then + write(*,*) "Cannot allocate the ELPA object!" + write(*,*) "This is a critical error!" + write(*,*) "ELPA not usable with this error" + endif +#ifdef USE_FORTRAN2008 + else + if (error2 .ne. ELPA_OK) then + write(*,*) "Cannot allocate the ELPA object!" + write(*,*) "This is a critical error, but you do not check the error codes!" + write(*,*) "ELPA not usable with this error" + stop + endif + endif +#endif + + end function + + + !> \brief function to deallocate an ELPA instance + !> Parameters + !> \details + !> \param obj class(elpa_t), pointer : pointer to the ELPA object to be destroyed and deallocated + !> \param error integer, optional : error code + subroutine elpa_deallocate(obj, error) + class(elpa_t), pointer :: obj +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + integer :: error2 + + call obj%destroy(error2) +#ifdef USE_FORTRAN2008 + if (present(error)) then +#endif + error = error2 + if (error .ne. ELPA_OK) then + write(*,*) "Cannot destroy the ELPA object!" + write(*,*) "This is a critical error!" + write(*,*) "This might lead to a memory leak in your application!" + error = ELPA_ERROR_CRITICAL + return + endif +#ifdef USE_FORTRAN2008 + else + if (error2 .ne. 
ELPA_OK) then + write(*,*) "Cannot destroy the ELPA object!" + write(*,*) "This is a critical error!" + write(*,*) "This might lead to a memory leak in your application!" + write(*,*) "But you do not check the error codes!" + return + endif + endif +#endif + deallocate(obj, stat=error2) + if (error2 .ne. 0) then + write(*,*) "Cannot deallocate the ELPA object!" + write(*,*) "This is a critical error!" + write(*,*) "This might lead to a memory leak in your application!" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + return + endif +#else + error = ELPA_ERROR_CRITICAL + return +#endif + endif + end subroutine + +#ifdef ENABLE_AUTOTUNING + !> \brief function to deallocate an ELPA autotune instance + !> Parameters + !> \details + !> \param obj class(elpa_autotune_t), pointer : pointer to the autotune object to be destroyed and deallocated + !> \param error integer, optional : error code + subroutine elpa_autotune_deallocate(obj, error) + class(elpa_autotune_t), pointer :: obj +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + integer :: error2 + call obj%destroy(error2) +#ifdef USE_FORTRAN2008 + if (present(error)) then +#endif + error = error2 + if (error2 .ne. ELPA_OK) then + write(*,*) "Cannot destroy the ELPA autotuning object!" + write(*,*) "This is a critical error!" + write(*,*) "This might lead to a memory leak in your application!" + error = ELPA_ERROR_CRITICAL + return + endif +#ifdef USE_FORTRAN2008 + else + if (error2 .ne. ELPA_OK) then + write(*,*) "Cannot destroy the ELPA autotuning object!" + write(*,*) "This is a critical error!" + write(*,*) "This might lead to a memory leak in your application!" + write(*,*) "But you do not check the error codes" + return + endif + endif +#endif + deallocate(obj, stat=error2) + if (error2 .ne. 0) then + write(*,*) "Cannot deallocate the ELPA autotuning object!" + write(*,*) "This is a critical error!" 
+ write(*,*) "This might lead to a memory leak in your application!" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + return + endif +#else + error = ELPA_ERROR_CRITICAL + return +#endif + endif + + end subroutine +#endif + +end module diff -Nru elpa-2016.05.001/src/elpa_generalized/cannon_back_template.c elpa-2019.11.001/src/elpa_generalized/cannon_back_template.c --- elpa-2016.05.001/src/elpa_generalized/cannon_back_template.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_generalized/cannon_back_template.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,531 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file has been developed within the ELPA-AEO // +// project, which has been a joint effort of +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Technische Universität München, Lehrstuhl für Theoretische Chemie, +// - Fritz-Haber-Institut, Berlin, Abt. 
Theorie, + +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ and +// http://elpa-aeo.mpcdf.mpg.de +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Valeriy Manin (Bergische Universität Wuppertal) +// integreated into the ELPA library Pavel Kus, Andeas Marek (MPCDF) + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define C_INT_TYPE_PTR long int* +#define C_INT_TYPE long int +#define BLAS_KIND c_int64_t +#else +#define C_INT_TYPE_PTR int* +#define C_INT_TYPE int +#define BLAS_KIND c_int +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define C_INT_MPI_TYPE_PTR long int* +#define C_INT_MPI_TYPE long int +#define MPI_KIND c_int64_t +#else +#define C_INT_MPI_TYPE_PTR int* +#define C_INT_MPI_TYPE int +#define MPI_KIND c_int +#endif + +// it seems, that we need those two levels of indirection to correctly expand macros +#define cannons_triang_rectangular_impl_expand2(SUFFIX) cannons_triang_rectangular_##SUFFIX +#define cannons_triang_rectangular_impl_expand1(SUFFIX) cannons_triang_rectangular_impl_expand2(SUFFIX) +#define cannons_triang_rectangular_impl cannons_triang_rectangular_impl_expand1(ELPA_IMPL_SUFFIX) + +#define cannons_triang_rectangular_c_impl_expand2(SUFFIX) cannons_triang_rectangular_c_##SUFFIX +#define cannons_triang_rectangular_c_impl_expand1(SUFFIX) cannons_triang_rectangular_c_impl_expand2(SUFFIX) +#define cannons_triang_rectangular_c_impl cannons_triang_rectangular_c_impl_expand1(ELPA_IMPL_SUFFIX) + +void cannons_triang_rectangular_impl(math_type* U, math_type* B, C_INT_TYPE np_rows, C_INT_TYPE np_cols, C_INT_TYPE my_prow, C_INT_TYPE my_pcol, C_INT_TYPE_PTR U_desc, C_INT_TYPE_PTR b_desc, math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm) +{ + // Cannons algorithm, Non-blocking version + // Input: + // - U is upper triangular matrix + // - B is rectangular matrix + // Output: + // - Res is a full rectangular matrix Res = U*B + // row_comm: communicator along rows + // col_comm: communicator along columns + // This function will be used for a backtransformation + + C_INT_TYPE na, nb, nblk, width, na_rows, na_cols, nb_cols, cols_in_buffer_U_my_initial, cols_in_buffer_U, rows_in_buffer_U, 
Size_receive_U_now, rows_in_buffer_U_now, cols_in_buffer_U_now, rows_in_buffer_U_my_initial; + + C_INT_MPI_TYPE Size_receive_U_nowMPI, Size_receive_UMPI, Size_receive_BMPI; + C_INT_TYPE i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber, Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block, curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult; + + math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff; + + C_INT_TYPE where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B, last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min; + + math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom; + + C_INT_TYPE ratio; + + MPI_Status status; + + C_INT_TYPE one = 1; + C_INT_TYPE zero = 0; + math_type done = 1.0; + math_type dzero = 0.0; + + na = U_desc[2]; + nblk = U_desc[4]; + nb = b_desc[3]; + + na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows); + na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols); + nb_cols = numroc_(&nb, &nblk, &my_pcol, &zero, &np_cols); + + MPI_Request request_U_Recv; + MPI_Request request_U_Send; + MPI_Request request_B_Recv; + MPI_Request request_B_Send; + + ///////////////////////////////////////////////////////////////// Start of algorithm /////////////////////////////////////////////////////////////////////////////////////////////// + last_proc_col_B = ((nb-1)/nblk) % np_cols; + last_proc_row_B = ((na-1)/nblk) % np_rows; + + /////////////////////////memory allocation area////////////////////////////////////////////////////////////// + + if(nb%nblk == 0) + if(my_pcol <= last_proc_col_B) + Buf_cols_B = nb_cols; + else + Buf_cols_B = nb_cols + nblk; + else + if(my_pcol < last_proc_col_B) + Buf_cols_B = nb_cols; + else if(my_pcol > last_proc_col_B) + Buf_cols_B = nb_cols + nblk; + else // if my_pcol == last_proc_col_B + Buf_cols_B = 
nb_cols + nblk - nb_cols%nblk; + + if(na%nblk == 0) + if(my_prow <= last_proc_row_B) + Buf_rows = na_rows; + else + Buf_rows = na_rows + nblk; + else + if(my_prow < last_proc_row_B) + Buf_rows = na_rows; + else if(my_prow > last_proc_row_B) + Buf_rows = na_rows + nblk; + else // if my_prow == last_proc_row_B + Buf_rows = na_rows + nblk - na_rows%nblk; + + ratio = np_cols/np_rows; + + intNumber = ceil((math_type)na/(math_type)(np_cols*nblk)); // max. possible number of the local block columns of U + Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.) + + Buf_to_send_U = malloc(ratio*Size_U_stored*sizeof(math_type)); + Buf_to_receive_U = malloc(ratio*Size_U_stored*sizeof(math_type)); + Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type)); + Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type)); + if(ratio != 1) + Buf_U = malloc(Size_U_stored*sizeof(math_type)); // in this case we will receive data into initial buffer and after place block-rows to the needed positions of buffer for calculation + + for(i = 0; i < na_rows*nb_cols; i++) + Res[i] = 0; + + /////////////////////////////////////////////////////////////// initial reordering of U ///////////////////////////////////////////////////////////////////////////////////////// + + // here we assume, that np_rows < np_cols; then I will send to the number of processors equal to with the "leap" equal to np_rows; the same holds for receive + if((ratio != 1)||(my_prow != 0)) // if grid is rectangular or my_prow is not 0 + Buf_pos = Buf_to_send_U; // I will copy to the send buffer + else + Buf_pos = Buf_to_receive_U; // if grid is square and my_prow is 0, then I will copy to the received buffer + + // form array to send by block-columns; we need only upper triangular part + // find the first local block belonging to the upper part of matrix U + if(my_pcol >= my_prow) // if 
I am in the upper part of proc. grid + curr_col_loc = 0; // my first local block-column has block from the upper part of matrix + else + curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows + + num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // number my of block-columns + num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first block-columns since they do not have blocks from the upper part of matrix U + curr_col_loc = curr_col_loc*nblk; // local index of the found block-column + + if(my_pcol >= my_prow ) + rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk; + else + rows_in_block = ratio*nblk; + cols_in_buffer_U_my_initial = 0; + Size_send_U = 0; + for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upepr part of U + { + if(rows_in_block > na_rows) + rows_in_block = na_rows; + + if ((na_cols - curr_col_loc) < nblk) + cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column + else + cols_in_block = nblk; + + if((rows_in_block > 0)&&(cols_in_block > 0)) + { + double_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer + C_LACPY("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column + Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied + Size_send_U = Size_send_U + rows_in_block*cols_in_block; + cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block; + } + curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U + rows_in_block = rows_in_block + ratio*nblk; + } + rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk; // remove redundant 
addition from the previous loop + *Buf_pos = (math_type)cols_in_buffer_U_my_initial; // write number of the columns at the end of the buffer; we will need this for furhter multiplications on the other processors + Buf_pos = Buf_pos + 1; + *Buf_pos = (math_type)rows_in_buffer_U_my_initial; // write number of the rows at the end of the buffer; we will need this for furhter multiplications on the other processors + Size_send_U = Size_send_U + 2; + + // now we have the local buffer to send + // find the lowest processor column among those who will send me + proc_col_min = np_cols; + for(i = 0; i < ratio; i++) + { + from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols; + if(from_where_to_receive_U < proc_col_min) + proc_col_min = from_where_to_receive_U; + } + + // do communications and form local buffers for calculations + Size_receive_U = 0; // size of the accumulated buffer + cols_in_buffer_U = 0; // number of columns in the accumulated buffer + rows_in_buffer_U = 0; // number of rows in the accumulated buffer + for(i = 0; i < ratio; i++) + { + where_to_send_U = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols; + from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols; + + // send and receive in the row_comm + if(ratio != 1) // if grid is not square + { + if(where_to_send_U != my_pcol) // if I need to send and receive on this step + { + MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, Buf_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_nowMPI); + Size_receive_U_now = (C_INT_TYPE) Size_receive_U_nowMPI; + Size_receive_U = Size_receive_U + Size_receive_U_now - 2; // we need only number of elements, so exclude information about cols_in_buffer_U and rows_in_buffer_ + + cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 
2]; + cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now; + rows_in_buffer_U_now = Buf_U[Size_receive_U_now - 1]; + + if(rows_in_buffer_U < rows_in_buffer_U_now) + rows_in_buffer_U = rows_in_buffer_U_now; + + intNumber = from_where_to_receive_U/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks + if(proc_col_min >= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the upper part + CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2]; // here I will copy to; formula based on arithm. progression + else // if among procs who will send me there is one from the lower part of grid + if(from_where_to_receive_U < my_prow) // if I have just received from this processor from the lower part + CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2]; // copy the first block of this processor after the first blocks from the others procs. that will send me later (the first block-column of this proc. is in the lower part of matrix) + else + CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2]; + CopyFrom = Buf_U; + } + else // if I need to send to myself on this step, then I will copy from Buf_to_send_U to Buf_to_receive_U + { + cols_in_buffer_U_now = cols_in_buffer_U_my_initial; + cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now; + + rows_in_buffer_U_now = rows_in_buffer_U_my_initial; + if(rows_in_buffer_U < rows_in_buffer_U_now) + rows_in_buffer_U = rows_in_buffer_U_now; + + intNumber = my_pcol/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks + if(proc_col_min >= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the upper part + CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2]; // here I will copy to; formula based on arithm. 
progression + else // if among procs who will send me there is one from the lower part of grid + if(my_pcol < my_prow) // if I have just received from this processor from the lower part (in this case it is me) + CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2]; // copy the first block of this processor after the first blocks from the others procs. that will send me later (the first block-column of this proc. is in the lower part of matrix) + else + CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2]; + CopyFrom = Buf_to_send_U; + Size_receive_U = Size_receive_U + Size_send_U - 2; + } + + // copy by block-columns + intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk); // how many block-columns I have received on this iteration + if(from_where_to_receive_U >= my_prow) + rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk; // number of rows in the first block-column of U buffer + else + rows_in_block = ratio*nblk; + for(j = 0; j < intNumber; j++) + { + if((j+1)*nblk < cols_in_buffer_U_now) + cols_in_block = nblk; + else + cols_in_block = cols_in_buffer_U_now - j*nblk; + + C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block); + + CopyFrom = CopyFrom + rows_in_block*cols_in_block; + CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2; // I need to leave place for ratio block-columns of the other procs. 
of the lengths rows_in_block, (rows_in_block+nblk), (rows_in_block+2*nblk) and so on + rows_in_block = rows_in_block + ratio*nblk; // number of rows in the next block-columns + if(rows_in_block > rows_in_buffer_U_now) + rows_in_block = rows_in_buffer_U_now; + } + } + else // if grid is square + { + if(my_prow > 0) + { + MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, Buf_to_receive_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); + Size_receive_U = (C_INT_TYPE) Size_receive_UMPI; + + cols_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2]; + rows_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1]; + } + else // if my_prow == 0, then I have already everything in my Buf_to_receive_U buffer + { + Size_receive_U = Size_send_U; + rows_in_buffer_U = rows_in_buffer_U_my_initial; + cols_in_buffer_U = cols_in_buffer_U_my_initial; + } + } + } + if(ratio != 1) + { + Buf_to_receive_U[Size_receive_U] = cols_in_buffer_U; + Buf_to_receive_U[Size_receive_U + 1] = rows_in_buffer_U; + Size_receive_U = Size_receive_U + 2; + } + + ////////////////////////////////////////////////////////////// initial reordering of B ///////////////////////////////////////////////////////////////////////////////////////// + + if(my_pcol > 0) + { + where_to_send_B = (my_prow - my_pcol + np_cols)%np_rows; // shift = my_pcol + from_where_to_receive_B = (my_pcol + my_prow)%np_rows; + + // send and receive in the row_comm + if(where_to_send_B != my_prow) // for the rectangular proc grids it may be possible that I need to "send to myself"; if it is not the case, then I send + { + // form array to send + C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows); + MPI_Sendrecv(Buf_to_send_B, (C_INT_MPI_TYPE) (nb_cols*na_rows), 
MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_B, 0, Buf_to_receive_B, (C_INT_MPI_TYPE) (nb_cols*Buf_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI); // find out how many elements I have received + Size_receive_B = (C_INT_TYPE) Size_receive_BMPI; + Size_receive_B = Size_receive_B/nb_cols; // how many rows I have received + + } + else + { + C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // else I copy data like I have "received" it + Size_receive_B = na_rows; + } + } + else + { + C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // if I am in the 0 proc row, I need not to send; so copy data like I have "received" it + Size_receive_B = na_rows; + } + + //////////////////////////////////////////////////////////////////////// main loop //////////////////////////////////////////////////////////////////////////////// + where_to_send_U = (my_pcol - 1 + np_cols)%np_cols; + from_where_to_receive_U = (my_pcol + 1)%np_cols; + where_to_send_B = (my_prow - 1 + np_rows)%np_rows; + from_where_to_receive_B = (my_prow + 1)%np_rows; + + for(i = 1; i < np_rows; i++) + { + // at this moment I need to send to neighbour what I have in the "received" arrays; that is why change pointers of the "received" and "send" arrays + double_ptr = Buf_to_send_U; + Buf_to_send_U = Buf_to_receive_U; + Buf_to_receive_U = double_ptr; + + double_ptr = Buf_to_send_B; + Buf_to_send_B = Buf_to_receive_B; + Buf_to_receive_B = double_ptr; + + Size_send_U = Size_receive_U; + Size_send_B = Size_receive_B; + + ///// shift for U //////////////////////////////////////////////////////////// + MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, row_comm, &request_U_Send); + MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) (ratio*Size_U_stored), 
MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, row_comm, &request_U_Recv); + ///// shift for B ///////////////////////////////////////////// + MPI_Isend(Buf_to_send_B, (C_INT_MPI_TYPE) (Size_send_B*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_B, 0, col_comm, &request_B_Send); + MPI_Irecv(Buf_to_receive_B, (C_INT_MPI_TYPE) (Buf_rows*nb_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_B, 0, col_comm, &request_B_Recv); + ///// multiplication //////////////////////////////////////////////////////////////////////////////////////////// + cols_in_buffer_U = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-2]; + rows_in_buffer_U = (C_INT_TYPE)Buf_to_send_U[Size_receive_U-1]; + //find minimal proc. column among those procs. who contributed in the current U buffer + proc_col_min = np_cols; + for(j = 0; j < ratio; j++) + { + col_of_origin_U = (my_pcol + my_prow + i - 1 + j*np_rows)%np_cols; + if(col_of_origin_U < proc_col_min) + proc_col_min = col_of_origin_U; + } + col_of_origin_U = proc_col_min; + + num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk); + + if (col_of_origin_U >= my_prow) + B_local_start = Buf_to_send_B; + else + B_local_start = Buf_to_send_B + nblk; + + U_local_start = Buf_to_send_U; + + for(j = 0; j < num_of_blocks_in_U_buffer; j++) + { + curr_rows = (j+1)*nblk; + if (curr_rows > rows_in_buffer_U) + curr_rows = rows_in_buffer_U; + + if((j+1)*nblk <= cols_in_buffer_U) + b_rows_mult = nblk; + else + b_rows_mult = cols_in_buffer_U - j*nblk; + + C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows); + + U_local_start = U_local_start + nblk*curr_rows; + B_local_start = B_local_start + nblk; + } + + MPI_Wait(&request_U_Send, &status); + MPI_Wait(&request_U_Recv, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements 
I have received + Size_receive_U = (C_INT_TYPE) Size_receive_UMPI; + + MPI_Wait(&request_B_Send, &status); + MPI_Wait(&request_B_Recv, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_BMPI); // find out how many elements I have received + Size_receive_B = (C_INT_TYPE) Size_receive_BMPI; + Size_receive_B = (C_INT_TYPE) Size_receive_B / nb_cols; // how many rows I have received + + } + + // last iteration + cols_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-2]; + rows_in_buffer_U = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1]; + //find minimal proc. column among those procs. who contributed in the current U buffer + proc_col_min = np_cols; + for(j = 0; j < ratio; j++) + { + col_of_origin_U = (my_pcol + my_prow + np_rows - 1 + j*np_rows)%np_cols; + if(col_of_origin_U < proc_col_min) + proc_col_min = col_of_origin_U; + } + col_of_origin_U = proc_col_min; + + num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk); + + if (col_of_origin_U >= my_prow) + B_local_start = Buf_to_receive_B; + else + B_local_start = Buf_to_receive_B + nblk; + + U_local_start = Buf_to_receive_U; + + for(j = 0; j < num_of_blocks_in_U_buffer; j++) + { + curr_rows = (j+1)*nblk; + if (curr_rows > rows_in_buffer_U) + curr_rows = rows_in_buffer_U; + + if((j+1)*nblk <= cols_in_buffer_U) + b_rows_mult = nblk; + else + b_rows_mult = cols_in_buffer_U - j*nblk; + + C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows); + + U_local_start = U_local_start + nblk*curr_rows; + B_local_start = B_local_start + nblk; + } + + free(Buf_to_send_U); + free(Buf_to_receive_U); + free(Buf_to_send_B); + free(Buf_to_receive_B); + if(ratio != 1) + free(Buf_U); +} + + +void cannons_triang_rectangular_c_impl(math_type* U, math_type* B, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR u_desc, C_INT_TYPE_PTR b_desc, math_type *Res, C_INT_MPI_TYPE row_comm, 
C_INT_MPI_TYPE col_comm) +{ + C_INT_TYPE local_rows, local_cols; + + local_rows = (C_INT_TYPE) local_rowsCast; + local_cols = (C_INT_TYPE) local_colsCast; + + MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm); + MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm); + + C_INT_TYPE my_prow, my_pcol, np_rows, np_cols; + C_INT_MPI_TYPE my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI; + + MPI_Comm_rank(c_row_comm, &my_prowMPI); + MPI_Comm_size(c_row_comm, &np_rowsMPI); + MPI_Comm_rank(c_col_comm, &my_pcolMPI); + MPI_Comm_size(c_col_comm, &np_colsMPI); + + my_prow = (C_INT_TYPE) my_prowMPI; + my_pcol = (C_INT_TYPE) my_pcolMPI; + np_rows = (C_INT_TYPE) np_rowsMPI; + np_cols = (C_INT_TYPE) np_colsMPI; + + // BEWARE + // in the cannons algorithm, column and row communicators are exchanged + // What we usually call row_comm in elpa, is thus passed to col_comm parameter of the function and vice versa + // (order is swapped in the following call) + // It is a bit unfortunate, maybe it should be changed in the Cannon algorithm to comply with ELPA standard notation? + cannons_triang_rectangular_impl(U, B, np_rows, np_cols, my_prow, my_pcol, u_desc, b_desc, Res, c_col_comm, c_row_comm); +} + diff -Nru elpa-2016.05.001/src/elpa_generalized/cannon.c elpa-2019.11.001/src/elpa_generalized/cannon.c --- elpa-2016.05.001/src/elpa_generalized/cannon.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_generalized/cannon.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,259 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. 
Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file has been developed within the ELPA-AEO // +// project, which has been a joint effort of +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Technische Universität München, Lehrstuhl für Theoretische Chemie, +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, + +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ and +// http://elpa-aeo.mpcdf.mpg.de +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Valeriy Manin (Bergische Universität Wuppertal) +// integreated into the ELPA library Pavel Kus, Andeas Marek (MPCDF) + +#include "config-f90.h" +#include +#include +#include +#include + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define C_INT_TYPE_PTR long int* +#define C_INT_TYPE long int +#define BLAS_KIND c_int64_t +#else +#define C_INT_TYPE_PTR int* +#define C_INT_TYPE int +#define BLAS_KIND c_int +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define C_INT_MPI_TYPE_PTR long int* +#define C_INT_MPI_TYPE long int +#define MPI_KIND c_int64_t +#else +#define C_INT_MPI_TYPE_PTR int* +#define C_INT_MPI_TYPE int +#define MPI_KIND c_int +#endif + + + +// most of the file is not compiled if not using MPI +#ifdef WITH_MPI +#include + + +//*********************************************************************************************************** + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "cannon_forw_template.c" +#include "cannon_back_template.c" +#undef DOUBLE_PRECISION +#undef REALCASE + +/* +!f> interface +!f> subroutine cannons_reduction_d(A, U, local_rowsCast, local_colsCast, a_desc, Res, toStore, row_comm, col_comm) & +!f> bind(C, name="cannons_reduction_c_d") +!f> use precision +!f> real(c_double) :: A(local_rowsCast, local_colsCast), U(local_rowsCast, local_colsCast) +!f> real(c_double) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: a_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm, ToStore +!f> end subroutine +!f> end interface + +*/ +void cannons_reduction_c_d(double* A, double* U, int local_rowsCast, int local_colsCast, C_INT_TYPE_PTR a_desc, + double *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +/* +!f> interface +!f> subroutine cannons_triang_rectangular_d(U, B, local_rowsCast, local_colsCast, u_desc, b_desc, Res, row_comm, col_comm) & +!f> 
bind(C, name="cannons_triang_rectangular_c_d") +!f> use precision +!f> real(c_double) :: U(local_rowsCast, local_colsCast), B(local_rowsCast, local_colsCast) +!f> real(c_double) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: u_desc(9), b_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm +!f> end subroutine +!f> end interface +*/ +void cannons_triang_rectangular_c_d(double* U, double* B, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR u_desc, C_INT_TYPE_PTR b_desc, double *Res, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +//*********************************************************************************************************** + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "cannon_forw_template.c" +#include "cannon_back_template.c" +#undef SINGLE_PRECISION +#undef REALCASE + +/* +!f> interface +!f> subroutine cannons_reduction_f(A, U, local_rowsCast, local_colsCast, a_desc, Res, toStore, row_comm, col_comm) & +!f> bind(C, name="cannons_reduction_c_f") +!f> use precision +!f> real(c_float) :: A(local_rowsCast, local_colsCast), U(local_rowsCast, local_colsCast) +!f> real(c_float) :: Res(local_rowsCast, local_colsCast) +!f> !type(c_ptr), value :: A, U, Res +!f> integer(kind=BLAS_KIND) :: a_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm, ToStore +!f> end subroutine +!f> end interface +*/ +void cannons_reduction_c_f(float* A, float* U, int local_rowsCast, int local_colsCast, C_INT_TYPE_PTR a_desc, + float *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +/* +!f> interface +!f> subroutine cannons_triang_rectangular_f(U, B, local_rowsCast, local_colsCast, u_desc, b_desc, Res, row_comm, col_comm) & +!f> bind(C, name="cannons_triang_rectangular_c_f") +!f> use precision +!f> 
real(c_float) :: U(local_rowsCast, local_colsCast), B(local_rowsCast, local_colsCast) +!f> real(c_float) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: u_desc(9), b_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm +!f> end subroutine +!f> end interface +*/ +void cannons_triang_rectangular_c_f(float* U, float* B, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR u_desc, C_INT_TYPE_PTR b_desc, float *Res, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +//*********************************************************************************************************** + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "cannon_forw_template.c" +#include "cannon_back_template.c" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +/* +!f> interface +!f> subroutine cannons_reduction_dc(A, U, local_rowsCast, local_colsCast, a_desc, Res, toStore, row_comm, col_comm) & +!f> bind(C, name="cannons_reduction_c_dc") +!f> use precision +!f> complex(c_double) :: A(local_rowsCast, local_colsCast), U(local_rowsCast, local_colsCast) +!f> complex(c_double) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: a_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm, ToStore +!f> end subroutine +!f> end interface +*/ +void cannons_reduction_c_dc(double complex* A, double complex* U, int local_rowsCast, int local_colsCasr, C_INT_TYPE_PTR a_desc, + double complex *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +/* +!f> interface +!f> subroutine cannons_triang_rectangular_dc(U, B, local_rowsCast, local_colsCast, u_desc, b_desc, Res, row_comm, col_comm) & +!f> bind(C, name="cannons_triang_rectangular_c_dc") +!f> use precision +!f> complex(c_double) :: U(local_rowsCast, local_colsCast), B(local_rowsCast, 
local_colsCast) +!f> complex(c_double) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: u_desc(9), b_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm +!f> end subroutine +!f> end interface +*/ +void cannons_triang_rectangular_c_dc(double complex* U, double complex* B, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR u_desc, C_INT_TYPE_PTR b_desc, double complex *Res, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); +//*********************************************************************************************************** + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../general/precision_macros.h" +#include "cannon_forw_template.c" +#include "cannon_back_template.c" +#undef SINGLE_PRECISION +#undef COMPLEXCASE + +/* +!f> interface +!f> subroutine cannons_reduction_fc(A, U, local_rowsCast, local_colsCast, a_desc, Res, toStore, row_comm, col_comm) & +!f> bind(C, name="cannons_reduction_c_fc") +!f> use precision +!f> complex(c_float) :: A(local_rowsCast, local_colsCast), U(local_rowsCast, local_colsCast) +!f> complex(c_float) :: Res(local_rowsCast, local_colsCast) +!f> !type(c_ptr), value :: A, U, Res +!f> integer(kind=BLAS_KIND) :: a_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm, ToStore +!f> end subroutine +!f> end interface +*/ + +void cannons_reduction_c_fc(float complex* A, float complex* U, int local_rowsCast, int local_colsCast, C_INT_TYPE_PTR a_desc, + float complex *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); + +/* +!f> interface +!f> subroutine cannons_triang_rectangular_fc(U, B, local_rowsCast, local_colsCast, u_desc, b_desc, Res, row_comm, col_comm) & +!f> bind(C, name="cannons_triang_rectangular_c_fc") +!f> use precision +!f> complex(c_float) :: U(local_rowsCast, local_colsCast), B(local_rowsCast, 
local_colsCast) +!f> complex(c_float) :: Res(local_rowsCast, local_colsCast) +!f> integer(kind=BLAS_KIND) :: u_desc(9), b_desc(9) +!f> integer(kind=c_int),value :: local_rowsCast, local_colsCast +!f> integer(kind=MPI_KIND),value :: row_comm, col_comm +!f> end subroutine +!f> end interface +*/ +void cannons_triang_rectangular_c_fc(float complex* U, float complex* B, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR u_desc, C_INT_TYPE_PTR b_desc, float complex *Res, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm); +#endif diff -Nru elpa-2016.05.001/src/elpa_generalized/cannon_forw_template.c elpa-2019.11.001/src/elpa_generalized/cannon_forw_template.c --- elpa-2016.05.001/src/elpa_generalized/cannon_forw_template.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_generalized/cannon_forw_template.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1010 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file has been developed within the ELPA-AEO // +// project, which has been a joint effort of +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Technische Universität München, Lehrstuhl für Theoretische Chemie, +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, + +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ and +// http://elpa-aeo.mpcdf.mpg.de +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Valeriy Manin (Bergische Universität Wuppertal) +// integreated into the ELPA library Pavel Kus, Andeas Marek (MPCDF) + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define C_INT_TYPE_PTR long int* +#define C_INT_TYPE long int +#define BLAS_KIND c_int64_t +#else +#define C_INT_TYPE_PTR int* +#define C_INT_TYPE int +#define BLAS_KIND c_int +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define C_INT_MPI_TYPE_PTR long int* +#define C_INT_MPI_TYPE long int +#define MPI_KIND c_int64_t +#else +#define C_INT_MPI_TYPE_PTR int* +#define C_INT_MPI_TYPE int +#define MPI_KIND c_int +#endif + + +// it seems, that we need those two levels of indirection to correctly expand macros +#define cannons_reduction_impl_expand2(SUFFIX) cannons_reduction_##SUFFIX +#define cannons_reduction_impl_expand1(SUFFIX) cannons_reduction_impl_expand2(SUFFIX) +#define cannons_reduction_impl cannons_reduction_impl_expand1(ELPA_IMPL_SUFFIX) + +#define cannons_reduction_c_impl_expand2(SUFFIX) cannons_reduction_c_##SUFFIX +#define cannons_reduction_c_impl_expand1(SUFFIX) cannons_reduction_c_impl_expand2(SUFFIX) +#define cannons_reduction_c_impl cannons_reduction_c_impl_expand1(ELPA_IMPL_SUFFIX) + +#include "../general/precision_typedefs.h" + +#include "../helpers/lapack_interfaces.h" +#include "../helpers/scalapack_interfaces.h" + +void cannons_reduction_impl(math_type* A, math_type* U, C_INT_TYPE np_rows, C_INT_TYPE np_cols, C_INT_TYPE my_prow, C_INT_TYPE my_pcol, + C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore, MPI_Comm row_comm, MPI_Comm col_comm) +{ + // Input matrices: + // - A: full matrix + // - U: upper triangular matrix U(-1) + // Output matrix: + // - Res = U(-H)*A*U(-1) + // row_comm: communicator along rows + // col_comm: communicator along columns + + C_INT_TYPE na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols, where_to_send_A, from_where_to_receive_A, where_to_send_U, from_where_to_receive_U, last_proc_row, 
last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber; + math_type *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *data_ptr, *Buf_A, *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored, *CopyTo, *CopyFrom, *U_to_calc; + C_INT_TYPE ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block, curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now; + C_INT_MPI_TYPE Size_receive_A_nowMPI, Size_receive_AMPI, Size_receive_UMPI; + + C_INT_TYPE row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U, rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res; + C_INT_TYPE curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index, LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U, rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min; + C_INT_TYPE *SizesU; + C_INT_TYPE Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now; + math_type done = 1.0; + math_type dzero = 0.0; + C_INT_TYPE one = 1; + C_INT_TYPE zero = 0; + C_INT_TYPE na_rows, na_cols; + + MPI_Status status; + MPI_Request request_A_Recv; + MPI_Request request_A_Send; + MPI_Request request_U_Recv; + MPI_Request request_U_Send; + + na = a_desc[2]; + nblk = a_desc[4]; + na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows); + na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols); + + if(ToStore > (np_rows -1)) + if((my_prow == 0)&&(my_pcol == 0)) + printf("Buffering level is larger than (np_rows-1) !!!\n"); + if((my_prow == 0)&&(my_pcol == 0)) + printf("Buffering level = %d\n", ToStore); + +//////////////////////////////////////////// Start of algorithm ////////////////////////////////////////////////////////////////////////////// + if (np_cols%np_rows != 0) + { + if((my_prow == 0)&& (my_pcol ==0)) + 
printf("!!!!! np_cols must be a multiple of np_rows!!!!! I do nothing! \n"); + return; + } + if (np_cols < np_rows != 0) + { + if((my_prow == 0)&& (my_pcol ==0)) + printf("np_cols < np_rows \n"); + return; + } + + ratio = np_cols/np_rows; + last_proc_row = ((na-1)/nblk) % np_rows; // processor row having the last block-row of matrix + last_proc_col = ((na-1)/nblk) % np_cols; // processor column having the last block-column of matrix + + /////////////////////////memory allocation area////////////////////////////////////////////////////////////// + if(na%nblk == 0) + if(my_pcol <= last_proc_col) + Buf_cols = na_cols; + else + Buf_cols = na_cols + nblk; + else + if(my_pcol < last_proc_col) + Buf_cols = na_cols; + else if(my_pcol > last_proc_col) + Buf_cols = na_cols + nblk; + else // if my_pcol == last_proc_col + Buf_cols = na_cols + nblk - na_cols%nblk; + + if(na%nblk == 0) + if(my_prow <= last_proc_row) + Buf_rows = na_rows; + else + Buf_rows = na_rows + nblk; + else + if(my_prow < last_proc_row) + Buf_rows = na_rows; + else if(my_prow > last_proc_row) + Buf_rows = na_rows + nblk; + else // if my_prow == last_proc_row + Buf_rows = na_rows + nblk - na_rows%nblk; + + intNumber = ceil((math_type)na/(math_type)(np_cols*nblk)); // max. possible number of the local block columns of U + Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.) 
+ + U_stored = malloc((Size_U_stored*(ToStore+1))*sizeof(math_type)); + SizesU = malloc(ToStore*sizeof(C_INT_TYPE)); // here will be stored the sizes of the buffers of U that I have stored + Buf_to_send_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type)); + Buf_to_receive_A = malloc(ratio*Buf_cols*Buf_rows*sizeof(math_type)); + Buf_to_send_U = malloc(Size_U_stored*sizeof(math_type)); + Buf_to_receive_U = malloc(Size_U_stored*sizeof(math_type)); + if(ratio != 1) + Buf_A = malloc(Buf_cols*Buf_rows*sizeof(math_type)); // in this case we will receive data into initial buffer and after place block-columns to the needed positions of buffer for calculation + M = malloc(na_rows*na_cols*sizeof(math_type)); + M_T = malloc(na_rows*na_cols*sizeof(math_type)); + for(i = 0; i < na_rows*na_cols; i++) + M[i] = 0; + + ////////////////////////////////////////////////////////////// initial reordering of A ///////////////////////////////////////////////////////////////////////////////////////// + + // here we assume, that np_rows < np_cols; then I will send to the number of processors equal to with the "leap" equal to np_rows; the same holds for receive + if(ratio != 1) + C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows); // copy my buffer to send + Size_receive_A = 0; + + // receive from different processors and place in my buffer for calculation; + for(i = 0; i < ratio; i++) + { + where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols; + from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols; + + // send and receive in the row_comm + if(ratio != 1) // if grid is not square + { + if(where_to_send_A != my_pcol) + { + MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) (na_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C,(C_INT_MPI_TYPE) where_to_send_A, 0, Buf_A, (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, 
&Size_receive_A_nowMPI); + Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI/na_rows; // how many columns of A I have received + } + else + Size_receive_A_now = na_cols; + Size_receive_A = Size_receive_A + Size_receive_A_now; // here accumulate number of columns of A that I will receive + + // now I need to copy the received block to my buffer for A + intNumber = from_where_to_receive_A/np_rows; // how many blocks I will receive, such that I will need to put them before the just received block + + CopyTo = &Buf_to_receive_A[intNumber*na_rows*nblk]; // here I will start copying the received buffer + if(where_to_send_A != my_pcol) + CopyFrom = Buf_A; + else + CopyFrom = A; + + intNumber = ceil((math_type)Size_receive_A_now/(math_type)nblk); // how many block-columns I have received + for(j = 0; j < intNumber; j++) + { + width = nblk; // width of the current block column + if(nblk*(j+1) > Size_receive_A_now) + width = Size_receive_A_now - nblk*j; + C_LACPY("A", &na_rows, &width, CopyFrom, &na_rows, CopyTo, &na_rows); + CopyTo = CopyTo + na_rows*nblk*ratio; + CopyFrom = CopyFrom + na_rows*nblk; + } + } + else // if grid is square then simply receive from one processor to a calculation buffer + if(my_prow > 0) + { + C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_send_A, &na_rows); // copy my buffer to send + MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) (na_cols*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_A, 0, Buf_to_receive_A, (C_INT_MPI_TYPE) (na_rows*Buf_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); + Size_receive_A = (C_INT_TYPE) Size_receive_AMPI; + } + else + { + C_LACPY("A", &na_rows, &na_cols, A, &na_rows, Buf_to_receive_A, &na_rows); // copy A to the received buffer if I do not need to send + Size_receive_A = na_cols; + } + } + + 
////////////////////////////////////////////////////////////// initial reordering of U ////////////////////////////////////////////////////// + + // form array to send by block-columns + num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // number my of block-columns + + where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows; // shift = my_pcol; we assume that np_cols%np_rows = 0 + from_where_to_receive_U = (my_pcol + my_prow)%np_rows; + + if(where_to_send_U == my_prow) // if I will not need to send my local part of U, then copy my local data to the "received" buffer + Buf_pos = Buf_to_receive_U; + else + Buf_pos = Buf_to_send_U; // else form the array to send + + // find the first local block belonging to the upper part of matrix U + if(my_pcol >= my_prow) // if I am in the upper part of proc. grid + curr_col_loc = 0; // my first local block-column has block from the upper part of matrix + else + curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows + + num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first block-columns since they do not have blocks from the upper part of matrix U + curr_col_loc = curr_col_loc*nblk; // local index of the found block-column + + if(my_pcol >= my_prow ) + rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk; + else + rows_in_block = ratio*nblk; + + Size_send_U = 0; + for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upepr part of U + { + if(rows_in_block > na_rows) + rows_in_block = na_rows; + + if ((na_cols - curr_col_loc) < nblk) + cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column + else + cols_in_block = nblk; + + if((rows_in_block > 0)&&(cols_in_block > 0)) + { + data_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer + C_LACPY("A", 
&rows_in_block, &cols_in_block, data_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column + Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied + Size_send_U = Size_send_U + rows_in_block*cols_in_block; + } + curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U + rows_in_block = rows_in_block + ratio*nblk; + } + rows_in_buffer = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop + *Buf_pos = (math_type)rows_in_buffer; // write number of the rows at the end of the buffer; we will need this for further multiplications on the other processors + Size_send_U = Size_send_U + 1; + + //send and receive + if(where_to_send_U != my_prow) + { + // send and receive in the col_comm + MPI_Sendrecv(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, col_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received + Size_receive_U = (C_INT_TYPE) Size_receive_UMPI; + } + else // if I do not need to send + Size_receive_U = Size_send_U; // how many elements I "have received"; the needed data I have already copied to the "receive" buffer + + for(i = 0; i < Size_receive_U; i++) + U_stored[i] = Buf_to_receive_U[i]; + Size_U_skewed = Size_receive_U; + Curr_pos_in_U_stored = Size_U_skewed; + + //////////////////////////////////////////////////////////////////////// main loop ///////////////////////////////////////////////////// + where_to_send_A = (my_pcol - 1 + np_cols)%np_cols; + from_where_to_receive_A = (my_pcol + 1)%np_cols; + where_to_send_U = (my_prow - 1 + np_rows)%np_rows; + from_where_to_receive_U = 
(my_prow + 1)%np_rows; + + for(j = 1; j < np_rows; j++) + { + // at this moment I need to send to neighbour what I have in the "received" arrays; that is why exchange pointers of the "received" and "send" arrays + data_ptr = Buf_to_send_A; + Buf_to_send_A = Buf_to_receive_A; + Buf_to_receive_A = data_ptr; + + data_ptr = Buf_to_send_U; + Buf_to_send_U = Buf_to_receive_U; + Buf_to_receive_U = data_ptr; + + ///// shift for A //////////////////////////////////////////////////////////// + Size_send_A = Size_receive_A; // number of block-columns of A and block-rows of U to send (that I have received on the previous step) + MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) (Size_send_A*na_rows), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_A, 0, row_comm, &request_A_Send); + MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (Buf_cols*na_rows*ratio), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &request_A_Recv); + + ///// shift for U ///////////////////////////////////////////// + Size_send_U = Size_receive_U; + MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, col_comm, &request_U_Send); + MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) (Buf_rows*na_cols), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, col_comm, &request_U_Recv); + + ///// multiplication //////////////////////////////////////////////////////////////////////////////////////////// + rows_in_buffer = (int)Buf_to_send_U[Size_receive_U-1]; + row_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows; + + if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U)) // if I and sender are from the upper part of grid + { + cols_in_buffer = na_cols; // then we have the same number of columns in the upper triangular part + curr_col_loc_res = 0; // all my block-columns have parts in the upper triangular part + curr_col_loc_buf = 0; // I use all the block-columns of the received 
buffer + } + if((my_pcol < my_prow)&&(my_pcol < row_origin_U)) // if I and sender are from the lower part of grid + { + cols_in_buffer = na_cols - nblk; // then we have the same number of columns in the upper triangular part, but the first block-column was not included + curr_col_loc_res = nblk; // I start update from the second block-column since the first on is in the lower triangular part + curr_col_loc_buf = 0; // I use all the block-columns of the received buffer + } + if((my_pcol >= my_prow)&&(my_pcol < row_origin_U)) // if I am from the upper part of grid and sender is from the lower part + { + cols_in_buffer = na_cols - nblk; // then I have received one block-column less than I have + curr_col_loc_res = nblk; // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffers corresponds to my second one + curr_col_loc_buf = 0; // I use all the block-columns of the received buffer + } + if((my_pcol < my_prow)&&(my_pcol >= row_origin_U)) // if I am from the lower part of grid and sender is from the upper part + { + cols_in_buffer = na_cols; // then I have received the full set of block-columns + curr_col_loc_res = nblk; // I start update from the second block-column since the first on is in the lower triangular part + curr_col_loc_buf = nblk; // I skip the first block-column of the buffer, since my first block-column is in the lower part + } + + num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk); + + startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2; + U_local_start = &Buf_to_send_U[startPos]; + Res_ptr = &M[curr_col_loc_res*na_rows]; + + for (i = 0; i < num_of_blocks_in_U_buffer; i++) + { + curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk; + proc_row_curr = (curr_col_glob/nblk)%np_rows; + rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk; // in A; not to go down beyond the upper triangular part + if(my_prow <= proc_row_curr) 
+ rows_in_block_A = rows_in_block_A + nblk; + + if(rows_in_block_A > na_rows) + rows_in_block_A = na_rows; + + if((curr_col_loc_buf + nblk) <= cols_in_buffer) + cols_in_block = nblk; // number columns in block of U which will take part in this calculation + else + cols_in_block = cols_in_buffer - curr_col_loc_buf; + + rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk; // corresponds to columns in A; + if(proc_row_curr >= row_origin_U) + rows_in_block_U = rows_in_block_U + nblk; + + if(rows_in_block_U > rows_in_buffer) + rows_in_block_U = rows_in_buffer; + + if ((rows_in_block_A > 0)&&(cols_in_block > 0)) + if (j == 1) { + C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &done, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &dzero, Res_ptr, &na_rows); + } + else { + C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &done, Buf_to_send_A, &na_rows, U_local_start, &rows_in_block_U, &done, Res_ptr, &na_rows); + } + + U_local_start = U_local_start + rows_in_block_U*cols_in_block; + curr_col_loc_res = curr_col_loc_res + nblk; + Res_ptr = &M[curr_col_loc_res*na_rows]; + curr_col_loc_buf = curr_col_loc_buf + nblk; + } + + MPI_Wait(&request_A_Send, &status); + MPI_Wait(&request_A_Recv, &status); + + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); // find out how many elements I have received + Size_receive_A = (C_INT_TYPE) Size_receive_AMPI; + Size_receive_A = Size_receive_A / na_rows; + + + MPI_Wait(&request_U_Send, &status); + MPI_Wait(&request_U_Recv, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received + Size_receive_U = (C_INT_TYPE) Size_receive_UMPI; + //// write in the buffer for later use //////////////////////////////7 + if(j <= ToStore) + { + for(k = 0; k < Size_receive_U; k++) + U_stored[Curr_pos_in_U_stored + k] = Buf_to_receive_U[k]; + Curr_pos_in_U_stored = Curr_pos_in_U_stored + Size_receive_U; + 
SizesU[j-1] = Size_receive_U; + } + } + + /////// do the last multiplication ////////////// + rows_in_buffer = (C_INT_TYPE)Buf_to_receive_U[Size_receive_U-1]; + row_origin_U = (my_pcol + my_prow + np_cols + np_rows -1)%np_rows; + + if((my_pcol >= my_prow)&&(my_pcol >= row_origin_U)) // if I and sender are from the upper part of grid + { + cols_in_buffer = na_cols; // then we have the same number of columns in the upper triangular part + curr_col_loc_res = 0; // all my block-columns have parts in the upper triangular part + curr_col_loc_buf = 0; // I use all the block-columns of the received buffer + } + if((my_pcol < my_prow)&&(my_pcol < row_origin_U)) // if I and sender are from the lower part of grid + { + cols_in_buffer = na_cols - nblk; // then we have the same number of columns in the upper triangular part, but the first block-column was not included + curr_col_loc_res = nblk; // I start update from the second block-column since the first on is in the lower triangular part + curr_col_loc_buf = 0; // I use all the block-columns of the received buffer + } + if((my_pcol >= my_prow)&&(my_pcol < row_origin_U)) // if I am from the upper part of grid and sender is from the lower part + { + cols_in_buffer = na_cols - nblk; // then I have received one block-column less than I have + curr_col_loc_res = nblk; // all my block-columns have parts in the upper triangular part, but the first block-column of the received buffers corresponds to my second one + curr_col_loc_buf = 0; // I use all the block-columns of the received buffer + } + if((my_pcol < my_prow)&&(my_pcol >= row_origin_U)) // if I am from the lower part of grid and sender is from the upper part + { + cols_in_buffer = na_cols; // then I have received the full set of block-columns + curr_col_loc_res = nblk; // I start update from the second block-column since the first on is in the lower triangular part + curr_col_loc_buf = nblk; // I skip the first block-column of the buffer, since my first block-column is in 
the lower part + } + + num_of_blocks_in_U_buffer = ceil(((math_type)cols_in_buffer - (math_type)curr_col_loc_buf)/(math_type)nblk); + + startPos = (curr_col_loc_buf + nblk)*curr_col_loc_buf/2; + U_local_start = &Buf_to_receive_U[startPos]; + Res_ptr = &M[curr_col_loc_res*na_rows]; + + for (i = 0; i < num_of_blocks_in_U_buffer; i++) + { + curr_col_glob = (curr_col_loc_res/nblk)*nblk*np_cols + my_pcol*nblk; + proc_row_curr = (curr_col_glob/nblk)%np_rows; + rows_in_block_A = (curr_col_glob/(nblk*np_rows))*nblk; // in A; not to go down beyond the upper triangular part + if(my_prow <= proc_row_curr) + rows_in_block_A = rows_in_block_A + nblk; + + if(rows_in_block_A > na_rows) + rows_in_block_A = na_rows; + + if((curr_col_loc_buf + nblk) <= cols_in_buffer) + cols_in_block = nblk; // number columns in block of U which will take part in this calculation + else + cols_in_block = cols_in_buffer - curr_col_loc_buf; + + rows_in_block_U = (curr_col_glob/(nblk*np_rows))*nblk; // corresponds to columns in A; + if(proc_row_curr >= row_origin_U) + rows_in_block_U = rows_in_block_U + nblk; + + if(rows_in_block_U > rows_in_buffer) + rows_in_block_U = rows_in_buffer; + + if ((rows_in_block_A > 0)&&(cols_in_block > 0)) + if (j == 1) { + C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &done, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &dzero, Res_ptr, &na_rows); + } + else { + C_GEMM("N", "N", &rows_in_block_A, &cols_in_block, &rows_in_block_U, &done, Buf_to_receive_A, &na_rows, U_local_start, &rows_in_block_U, &done, Res_ptr, &na_rows); + } + + U_local_start = U_local_start + rows_in_block_U*cols_in_block; + curr_col_loc_res = curr_col_loc_res + nblk; + Res_ptr = &M[curr_col_loc_res*na_rows]; + curr_col_loc_buf = curr_col_loc_buf + nblk; + } + + ///////////////////// Now M has an upper part of A*U(-1) /////////////////////////////////////////////// + + C_PTRAN(&na, &na, &done, M, &one, &one, a_desc, &dzero, M_T, &one, &one, a_desc); // now M_T has 
lower part of U(-H)*A + + ////////////////////////////////////////////////// start algorithm to find lower part of U(-H)*A*U(-1) ////////////////////////// + + /////////////////////////////////////////////////////////////// initial reordering of A //////////////////////////////////////////////// + + // here we assume, that np_rows < np_cols; then I will send to the number of processors equal to with the "leap" equal to np_rows; the same holds for receive + if((ratio != 1)||(my_prow != 0)) // if grid is rectangular or my_prow is not 0 + Buf_pos = Buf_to_send_A; // I will copy to the send buffer + else + Buf_pos = Buf_to_receive_A; // if grid is square and my_prow is 0, then I will copy to the received buffer + + // form array to send by block-columns; we need only lower triangular part + num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // number my of block-columns + + cols_in_buffer_A_my_initial = 0; + Size_send_A = 0; + + if(my_pcol <= my_prow) // if I am from the lower part of grid + { + curr_row_loc = 0; // I will copy all my block-rows + rows_in_buffer_A_my_initial = na_rows; + } + else + { + curr_row_loc = ceil((math_type)(((math_type)my_pcol - (math_type)my_prow)/(math_type)np_rows))*nblk; // I will skip some of my block-rows + rows_in_buffer_A_my_initial = na_rows - curr_row_loc; + } + + for(i = 0; i < num_of_iters; i++) // loop over my block-columns + { + curr_col_loc = i*nblk; // local index of start of the current block-column + rows_in_block = na_rows - curr_row_loc; // how many rows do I have in the lower part of the current block-column + + if ((na_cols - curr_col_loc) < nblk) + cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the block-column + else + cols_in_block = nblk; + + if((rows_in_block > 0)&&(cols_in_block > 0)) + { + A_local_start = &M_T[curr_col_loc*na_rows + curr_row_loc]; + C_LACPY("A", &rows_in_block, &cols_in_block, A_local_start, &na_rows, Buf_pos, &rows_in_block); // copy lower part of block-column in 
the buffer with LDA = length of the lower part of block-column + Buf_pos = Buf_pos + rows_in_block*cols_in_block; + Size_send_A = Size_send_A + rows_in_block*cols_in_block; + cols_in_buffer_A_my_initial = cols_in_buffer_A_my_initial + cols_in_block; + } + curr_row_loc = curr_row_loc + ratio*nblk; + } + *Buf_pos = (math_type)cols_in_buffer_A_my_initial; // write number of the columns at the end of the buffer; we will need this for furhter multiplications on the other processors + Size_send_A = Size_send_A + 1; + + // now we have the local buffer to send + // find the lowest processor column among those who will send me + proc_col_min = np_cols; + for(i = 0; i < ratio; i++) + { + from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols; + if(from_where_to_receive_A < proc_col_min) + proc_col_min = from_where_to_receive_A; + } + // do communications and form local buffers for calculations + Size_receive_A = 0; // size of the accumulated buffer + cols_in_buffer_A = 0; // number of columns in the accumulated buffer + rows_in_buffer_A = 0; // number of rows in the accumulated buffer + for(i = 0; i < ratio; i++) + { + where_to_send_A = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols; + from_where_to_receive_A = (my_pcol + my_prow + i*np_rows)%np_cols; + + // send and receive in the row_comm + if(ratio != 1) // if grid is not square + { + if(where_to_send_A != my_pcol) // if I need to send and receive on this step + { + MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_A, 0, Buf_A, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_A_nowMPI); + Size_receive_A_now = (C_INT_TYPE) Size_receive_A_nowMPI; + + Size_receive_A = Size_receive_A + Size_receive_A_now - 1; // we need only number of elements, so exclude information about cols_in_buffer_A + + 
cols_in_buffer_A_now = Buf_A[Size_receive_A_now-1]; + cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now; + + // determine number of rows in the received buffer + if(from_where_to_receive_A <= my_prow) // if source is from the lower part of grid + { + rows_in_buffer_A_now = na_rows; + } + else + { + rows_in_buffer_A_now = na_rows - ceil((math_type)(((math_type)from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk; // some of the block-rows have been skipped + } + if(rows_in_buffer_A < rows_in_buffer_A_now) + rows_in_buffer_A = rows_in_buffer_A_now; + + intNumber = from_where_to_receive_A/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks + if(proc_col_min <= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the lower part + CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. progression + else + CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block + CopyFrom = Buf_A; + } + else // if I need to send to myself on this step, then I will copy from Buf_to_send_L to Buf_to_receive_A + { + cols_in_buffer_A_now = cols_in_buffer_A_my_initial; + cols_in_buffer_A = cols_in_buffer_A + cols_in_buffer_A_now; + + rows_in_buffer_A_now = rows_in_buffer_A_my_initial; + if(rows_in_buffer_A < rows_in_buffer_A_now) + rows_in_buffer_A = rows_in_buffer_A_now; + + intNumber = my_pcol/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks + if(proc_col_min <= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the lower part + CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. 
progression + else + CopyTo = &Buf_to_receive_A[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block + CopyFrom = Buf_to_send_A; + + Size_receive_A = Size_receive_A + Size_send_A - 1; + } + + // copy by block-columns + intNumber = ceil((math_type)cols_in_buffer_A_now/(math_type)nblk); // how many block-columns I have received on this iteration + rows_in_block = rows_in_buffer_A_now; + for(j = 0; j < intNumber; j++) + { + if((j+1)*nblk < cols_in_buffer_A_now) + cols_in_block = nblk; + else + cols_in_block = cols_in_buffer_A_now - j*nblk; + + C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block); + + CopyFrom = CopyFrom + rows_in_block*cols_in_block; + CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2); // I need to leave place for ratio block-columns of the other procs. of the lengths rows_in_block, (rows_in_block-nblk), (rows_in_block-2*nblk) and so on + rows_in_block = rows_in_block - ratio*nblk; // number of rows in the next block-columns + } + } + else // if grid is square + { + if(my_prow > 0) + { + MPI_Sendrecv(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_A, 0, Buf_to_receive_A, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); + Size_receive_A = (C_INT_TYPE) Size_receive_AMPI; + + cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1]; + if(from_where_to_receive_A <= my_prow) // if source is from the lower part of grid + { + rows_in_buffer_A = na_rows; + } + else + { + rows_in_buffer_A = na_rows - ceil((math_type)(((math_type)from_where_to_receive_A - (math_type)my_prow)/(math_type)np_rows))*nblk; // some of the block-rows have been skipped + } + } + else // if my_prow == 0, then I have already everything 
in my Buf_to_receive_A buffer + { + Size_receive_A = Size_send_A; + rows_in_buffer_A = rows_in_buffer_A_my_initial; + cols_in_buffer_A = cols_in_buffer_A_my_initial; + } + } + } + if(ratio != 1) + { + Buf_to_receive_A[Size_receive_A] = cols_in_buffer_A; + Buf_to_receive_A[Size_receive_A + 1] = rows_in_buffer_A; + Size_receive_A = Size_receive_A + 2; + } + else + { + Buf_to_receive_A[Size_receive_A] = rows_in_buffer_A; + Size_receive_A = Size_receive_A + 1; + } + + ////////////////////////////////////////////////////////////// initial reordering of U: restore skewed U from the first multiplication /////////////////////////// + + Size_receive_U = Size_U_skewed; + U_to_calc = U_stored; + + //////////////////////////////////////////////////////////////////////// main loop //////////////////////////////////////////////////////////////////////////////// + + where_to_send_A = (my_pcol - 1 + np_cols)%np_cols; + from_where_to_receive_A = (my_pcol + 1)%np_cols; + where_to_send_U = (my_prow - 1 + np_rows)%np_rows; + from_where_to_receive_U = (my_prow + 1)%np_rows; + Curr_pos_in_U_stored = Size_U_skewed; + + for(j = 1; j < np_rows; j++) + { + // at this moment I need to send to neighbour what I have in the "received" arrays; that is why exchange pointers of the "received" and "send" arrays + data_ptr = Buf_to_send_A; + Buf_to_send_A = Buf_to_receive_A; + Buf_to_receive_A = data_ptr; + + if (j > ToStore) + { + data_ptr = Buf_to_send_U; + Buf_to_send_U = Buf_to_receive_U; + Buf_to_receive_U = data_ptr; + } + + ///// shift for A //////////////////////////////////////////////////////////// + Size_send_A = Size_receive_A; + MPI_Isend(Buf_to_send_A, (C_INT_MPI_TYPE) Size_send_A, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_A, 0, row_comm, &request_A_Send); + MPI_Irecv(Buf_to_receive_A, (C_INT_MPI_TYPE) (ratio*Size_U_stored), MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_A, 0, row_comm, &request_A_Recv); + + ///// shift for U 
///////////////////////////////////////////// + Size_send_U = Size_receive_U; + if (j > ToStore) + { + if(j > ToStore + 1) + { + MPI_Isend(Buf_to_send_U, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, col_comm, &request_U_Send); + U_to_calc = Buf_to_send_U; + } + else { + MPI_Isend(U_to_calc, (C_INT_MPI_TYPE) Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) where_to_send_U, 0, col_comm, &request_U_Send); + } + MPI_Irecv(Buf_to_receive_U, (C_INT_MPI_TYPE) Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, (C_INT_MPI_TYPE) from_where_to_receive_U, 0, col_comm, &request_U_Recv); + } + + ///// multiplication //////////////////////////////////////////////////////////////////////////////////////////// + rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1]; + row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows; + if(my_pcol >= row_of_origin_U) + cols_in_buffer_U = na_cols; + else + cols_in_buffer_U = na_cols - nblk; + + cols_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-2]; + rows_in_buffer_A = (C_INT_TYPE)Buf_to_send_A[Size_receive_A-1]; + // find the minimal pcol among those who have sent A for this iteration + col_of_origin_A = np_cols; + for(i = 0; i < ratio; i++) + { + intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols; + if(intNumber < col_of_origin_A) + col_of_origin_A = intNumber; + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // find block-column of the result to start update with + if (my_pcol >= row_of_origin_U) // if origin of U is from the upper part + curr_col_loc_res = 0; // then I update all columns of Result + else + curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column + + num_of_blocks_in_U_buffer = 
ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk)); + if(my_pcol >= row_of_origin_U) // if origin of U is from the upper part + rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk; // blocks in the first block-column of U buffer + else + rows_in_block_U = ratio*nblk; + + U_local_start = U_to_calc; + + for (i = 0; i < num_of_blocks_in_U_buffer; i++) + { + // find block-row of the result to start update with; we need to update only lower triangular part of result + curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated + // now we need to find the smallest my local row index, such that the corresponding global index is larger of equal to + Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one + owner = Nb%np_rows; // proc. row index of the owner of row with the global index equal to (it is not necessarily me) + curr_row_loc_res = (Nb/np_rows)*nblk; + if(my_prow < owner) + curr_row_loc_res = curr_row_loc_res + nblk; + + curr_row_loc_A = curr_row_loc_res; // it is impossible, that both col_of_origin_L and row_of_origin_U are from upper part + if(col_of_origin_A > my_prow) + curr_row_loc_A = curr_row_loc_A - nblk; + + rows_in_block = rows_in_buffer_A - curr_row_loc_A; // rows in current block of A + + curr_col_loc_U = i*nblk; // local index in the buffer U of the current column + + if((curr_col_loc_U + nblk) <= cols_in_buffer_U) + cols_in_block = nblk; // number columns in block of U which will take part in this calculation + else + cols_in_block = cols_in_buffer_U - curr_col_loc_U; + + if(rows_in_block_U > rows_in_buffer_U) + rows_in_block_U = rows_in_buffer_U; // rows in current column of U; also a leading dimension for U + + A_local_index = curr_row_loc_A; + A_local_start = &Buf_to_send_A[A_local_index]; + Res_ptr = &Res[curr_col_loc_res*na_rows + 
curr_row_loc_res]; + + LDA_A = rows_in_buffer_A; + LDA_A_new = LDA_A; + if ((rows_in_block > 0)&&(cols_in_block > 0)) + { + U_local_start_curr = U_local_start; + + // loop over block-columns of the "active" part of L buffer + for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++) + { + if((ii+1)*nblk <= cols_in_buffer_A) + rows_in_block_U_curr = nblk; + else + rows_in_block_U_curr = cols_in_buffer_A - ii*nblk; + + if((j == 1)&&(ii == 0)) { + C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dzero, Res_ptr, &na_rows); + } + else { + C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &done, Res_ptr, &na_rows); + } + + LDA_A_new = LDA_A_new - nblk; + + U_local_start_curr = U_local_start_curr + rows_in_block_U_curr; + A_local_index = A_local_index - LDA_A + LDA_A*nblk + LDA_A_new; + A_local_start = &Buf_to_send_A[A_local_index]; + LDA_A = LDA_A_new; + } + } + + U_local_start = U_local_start + rows_in_block_U*cols_in_block; + curr_col_loc_res = curr_col_loc_res + nblk; + rows_in_block_U = rows_in_block_U + ratio*nblk; + } + + MPI_Wait(&request_A_Send, &status); + MPI_Wait(&request_A_Recv, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_AMPI); // find out how many elements I have received + Size_receive_A = (C_INT_TYPE) Size_receive_AMPI; + + if (j <= ToStore) + { + U_to_calc = &U_stored[Curr_pos_in_U_stored]; + Curr_pos_in_U_stored = Curr_pos_in_U_stored + SizesU[j-1]; + Size_receive_U = SizesU[j-1]; + } + else + { + MPI_Wait(&request_U_Send, &status); + MPI_Wait(&request_U_Recv, &status); + MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_UMPI); // find out how many elements I have received + Size_receive_U = (C_INT_TYPE) Size_receive_UMPI; + } + } + + /////// do the last multiplication ////////////// + if(ToStore < np_rows 
- 1) + U_to_calc = Buf_to_receive_U; + rows_in_buffer_U = (C_INT_TYPE)U_to_calc[Size_receive_U-1]; + row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows; + if(my_pcol >= row_of_origin_U) + cols_in_buffer_U = na_cols; + else + cols_in_buffer_U = na_cols - nblk; + + cols_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-2]; + rows_in_buffer_A = (C_INT_TYPE)Buf_to_receive_A[Size_receive_A-1]; + // find the minimal pcol among those who have sent A for this iteration + col_of_origin_A = np_cols; + for(i = 0; i < ratio; i++) + { + intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols; + if(intNumber < col_of_origin_A) + col_of_origin_A = intNumber; + } + + // find block-column of the result to start update with + if (my_pcol >= row_of_origin_U) // if origin of U is from the upper part + curr_col_loc_res = 0; // then I update all columns of Result + else + curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column + + num_of_blocks_in_U_buffer = ceil((math_type)((math_type)cols_in_buffer_U/(math_type)nblk)); + if(my_pcol >= row_of_origin_U) // if origin of U is from the upper part + rows_in_block_U = ceil(((math_type)(my_pcol + 1) - (math_type)row_of_origin_U)/(math_type)np_rows)*nblk; // blocks in the first block-column of U buffer + else + rows_in_block_U = ratio*nblk; + + U_local_start = U_to_calc; + + for (i = 0; i < num_of_blocks_in_U_buffer; i++) + { + // find block-row of the result to start update with; we need to update only lower triangular part of result + curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated + // now we need to find the smallest my local row index, such that the corresponding global index is larger of equal to + Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one + owner = 
Nb%np_rows; // proc. row index of the owner of row with the global index equal to (it is not necessarily me) + curr_row_loc_res = (Nb/np_rows)*nblk; + if(my_prow < owner) + curr_row_loc_res = curr_row_loc_res + nblk; + + curr_row_loc_A = curr_row_loc_res; // it is impossible, that both col_of_origin_L and row_of_origin_U are from upper part + if(col_of_origin_A > my_prow) + curr_row_loc_A = curr_row_loc_A - nblk; + + rows_in_block = rows_in_buffer_A - curr_row_loc_A; //rows in current block of + + curr_col_loc_U = i*nblk; // local index in the buffer U of the current column + + if((curr_col_loc_U + nblk) <= cols_in_buffer_U) + cols_in_block = nblk; // number columns in block of U which will take part in this calculation + else + cols_in_block = cols_in_buffer_U - curr_col_loc_U; + + if(rows_in_block_U > rows_in_buffer_U) + rows_in_block_U = rows_in_buffer_U; + + A_local_index = curr_row_loc_A; + A_local_start = &Buf_to_receive_A[A_local_index]; + Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res]; + LDA_A = rows_in_buffer_A; + LDA_A_new = LDA_A; + if ((rows_in_block > 0) &&(cols_in_block > 0)) + { + U_local_start_curr = U_local_start; + + // loop over block-columns of the "active" part of L buffer + for (ii = 0; ii < ceil((math_type)rows_in_block_U/(math_type)nblk); ii++) + { + if((ii+1)*nblk <= cols_in_buffer_A) + rows_in_block_U_curr = nblk; + else + rows_in_block_U_curr = cols_in_buffer_A - ii*nblk; + + if((j == 1)&&(ii == 0)) { + C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &dzero, Res_ptr, &na_rows); + } + else { + C_GEMM("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, A_local_start, &LDA_A, U_local_start_curr, &rows_in_block_U, &done, Res_ptr, &na_rows); + } + + LDA_A_new = LDA_A_new - nblk; + + U_local_start_curr = U_local_start_curr + rows_in_block_U_curr; + A_local_index = A_local_index - (LDA_A - rows_in_block) + LDA_A*nblk + 
LDA_A_new - rows_in_block; + A_local_start = &Buf_to_receive_A[A_local_index]; + LDA_A = LDA_A_new; + } + } + + U_local_start = U_local_start + rows_in_block_U*cols_in_block; + curr_col_loc_res = curr_col_loc_res + nblk; + rows_in_block_U = rows_in_block_U + ratio*nblk; + } + + C_PTRAN(&na, &na, &done, Res, &one, &one, a_desc, &dzero, M, &one, &one, a_desc); + C_PLACPY("U", &na, &na, M, &one, &one, a_desc, Res, &one, &one, a_desc); + + + free(Buf_to_send_A); + free(Buf_to_receive_A); + free(Buf_to_send_U); + free(Buf_to_receive_U); + free(M); + free(M_T); + if(ratio != 1) + free(Buf_A); + free(U_stored); + free(SizesU); +} + +void cannons_reduction_c_impl(math_type* A, math_type* U, int local_rowsCast, int local_colsCast, + C_INT_TYPE_PTR a_desc, math_type *Res, C_INT_MPI_TYPE ToStore, C_INT_MPI_TYPE row_comm, C_INT_MPI_TYPE col_comm) +{ + C_INT_TYPE local_rows, local_cols; + local_rows = (C_INT_TYPE) local_rowsCast; + local_cols = (C_INT_TYPE) local_colsCast; + + MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm); + MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm); + + + C_INT_MPI_TYPE my_prow, my_pcol, np_rows, np_cols; + C_INT_MPI_TYPE my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI; + + MPI_Comm_rank(c_row_comm, &my_prowMPI); + MPI_Comm_size(c_row_comm, &np_rowsMPI); + MPI_Comm_rank(c_col_comm, &my_pcolMPI); + MPI_Comm_size(c_col_comm, &np_colsMPI); + + my_prow = (C_INT_TYPE) my_prowMPI; + my_pcol = (C_INT_TYPE) my_pcolMPI; + np_rows = (C_INT_TYPE) np_rowsMPI; + np_cols = (C_INT_TYPE) np_colsMPI; + + // BEWARE + // in the cannons algorithm, column and row communicators are exchanged + // What we usually call row_comm in elpa, is thus passed to col_comm parameter of the function and vice versa + // (order is swapped in the following call) + // It is a bit unfortunate, maybe it should be changed in the Cannon algorithm to comply with ELPA standard notation? 
+ cannons_reduction_impl(A, U, np_rows, np_cols, my_prow, my_pcol, a_desc, Res, ToStore, c_col_comm, c_row_comm); +} + diff -Nru elpa-2016.05.001/src/elpa_generated_fortran_interfaces.F90 elpa-2019.11.001/src/elpa_generated_fortran_interfaces.F90 --- elpa-2016.05.001/src/elpa_generated_fortran_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_generated_fortran_interfaces.F90 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,65 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. 
If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! + +#include "config-f90.h" + +!#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +!#define FORTRAN_INT_TYPE c_int64_t +!#else +!#define FORTRAN_INT_TYPE c_int +!#endif + +#define FORTRAN_INT_TYPE c_int64_t + +module elpa_generated_fortran_interfaces + use iso_c_binding + implicit none + +#include "src/elpa_generated_fortran_interfaces.h" + +end module diff -Nru elpa-2016.05.001/src/elpa_impl.F90 elpa-2019.11.001/src/elpa_impl.F90 --- elpa-2016.05.001/src/elpa_impl.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_impl.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1963 @@ +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +#include "config-f90.h" + +!> \brief Fortran module which provides the actual implementation of the API. Do not use directly! Use the module "elpa" +module elpa_impl + use precision + use elpa2_impl + use elpa1_impl + use elpa1_auxiliary_impl + use elpa_mpi + use elpa_generated_fortran_interfaces + use elpa_utilities, only : error_unit +#ifdef HAVE_LIKWID + use likwid +#endif + + use elpa_abstract_impl +#ifdef ENABLE_AUTOTUNING + use elpa_autotune_impl +#endif + use, intrinsic :: iso_c_binding + use iso_fortran_env + implicit none + + private + public :: elpa_impl_allocate + +!> \brief Definition of the extended elpa_impl_t type + type, extends(elpa_abstract_impl_t) :: elpa_impl_t + private + integer :: communicators_owned + + !This object has been created through the legacy api. + integer :: from_legacy_api + + !> \brief methods available with the elpa_impl_t type + contains + !> \brief the puplic methods + ! 
con-/destructor + procedure, public :: setup => elpa_setup !< a setup method: implemented in elpa_setup + procedure, public :: destroy => elpa_destroy !< a destroy method: implemented in elpa_destroy + + ! KV store + procedure, public :: is_set => elpa_is_set !< a method to check whether a key/value pair has been set : implemented + !< in elpa_is_set + procedure, public :: can_set => elpa_can_set !< a method to check whether a key/value pair can be set : implemented + !< in elpa_can_set + + ! call before setup if created from the legacy api + ! remove this function completely after the legacy api is dropped + procedure, public :: creating_from_legacy_api => elpa_creating_from_legacy_api + + ! timer + procedure, public :: get_time => elpa_get_time + procedure, public :: print_times => elpa_print_times + procedure, public :: timer_start => elpa_timer_start + procedure, public :: timer_stop => elpa_timer_stop + + + !> \brief the implemenation methods + + procedure, public :: elpa_eigenvectors_d !< public methods to implement the solve step for real/complex + !< double/single matrices + procedure, public :: elpa_eigenvectors_f + procedure, public :: elpa_eigenvectors_dc + procedure, public :: elpa_eigenvectors_fc + + procedure, public :: elpa_eigenvalues_d !< public methods to implement the solve step for real/complex + !< double/single matrices; only the eigenvalues are computed + procedure, public :: elpa_eigenvalues_f + procedure, public :: elpa_eigenvalues_dc + procedure, public :: elpa_eigenvalues_fc + + procedure, public :: elpa_skew_eigenvectors_d !< public methods to implement the solve step for real skew-symmetric + !< double/single matrices + procedure, public :: elpa_skew_eigenvectors_f + + procedure, public :: elpa_skew_eigenvalues_d !< public methods to implement the solve step for real skew-symmetric + !< double/single matrices; only the eigenvalues are computed + procedure, public :: elpa_skew_eigenvalues_f + + + procedure, public :: 
elpa_generalized_eigenvectors_d !< public methods to implement the solve step for generalized + !< eigenproblem and real/complex double/single matrices + procedure, public :: elpa_generalized_eigenvectors_f + procedure, public :: elpa_generalized_eigenvectors_dc + procedure, public :: elpa_generalized_eigenvectors_fc + + procedure, public :: elpa_generalized_eigenvalues_d !< public methods to implement the solve step for generalized + !< eigenproblem and real/complex double/single matrices + procedure, public :: elpa_generalized_eigenvalues_f + procedure, public :: elpa_generalized_eigenvalues_dc + procedure, public :: elpa_generalized_eigenvalues_fc + + procedure, public :: elpa_hermitian_multiply_d !< public methods to implement a "hermitian" multiplication of matrices a and b + procedure, public :: elpa_hermitian_multiply_f !< for real valued matrices: a**T * b + procedure, public :: elpa_hermitian_multiply_dc !< for complex valued matrices: a**H * b + procedure, public :: elpa_hermitian_multiply_fc + + procedure, public :: elpa_cholesky_d !< public methods to implement the cholesky factorisation of + !< real/complex double/single matrices + procedure, public :: elpa_cholesky_f + procedure, public :: elpa_cholesky_dc + procedure, public :: elpa_cholesky_fc + + procedure, public :: elpa_invert_trm_d !< public methods to implement the inversion of a triangular + !< real/complex double/single matrix + procedure, public :: elpa_invert_trm_f + procedure, public :: elpa_invert_trm_dc + procedure, public :: elpa_invert_trm_fc + + procedure, public :: elpa_solve_tridiagonal_d !< public methods to implement the solve step for a real valued + procedure, public :: elpa_solve_tridiagonal_f !< double/single tridiagonal matrix + + procedure, public :: associate_int => elpa_associate_int !< public method to set some pointers + + procedure, private :: elpa_transform_generalized_d + procedure, private :: elpa_transform_back_generalized_d + procedure, private :: 
elpa_transform_generalized_dc + procedure, private :: elpa_transform_back_generalized_dc +#ifdef WANT_SINGLE_PRECISION_REAL + procedure, private :: elpa_transform_generalized_f + procedure, private :: elpa_transform_back_generalized_f +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + procedure, private :: elpa_transform_generalized_fc + procedure, private :: elpa_transform_back_generalized_fc +#endif + + procedure, public :: print_settings => elpa_print_settings + procedure, public :: store_settings => elpa_store_settings + procedure, public :: load_settings => elpa_load_settings +#ifdef ENABLE_AUTOTUNING + procedure, public :: autotune_setup => elpa_autotune_setup + procedure, public :: autotune_step => elpa_autotune_step + procedure, public :: autotune_set_best => elpa_autotune_set_best + procedure, public :: autotune_print_best => elpa_autotune_print_best + procedure, public :: autotune_print_state => elpa_autotune_print_state + procedure, public :: autotune_save_state => elpa_autotune_save_state + procedure, public :: autotune_load_state => elpa_autotune_load_state +#endif + procedure, private :: construct_scalapack_descriptor => elpa_construct_scalapack_descriptor + end type elpa_impl_t + + !> \brief the implementation of the generic methods + contains + + + !> \brief function to allocate an ELPA object + !> Parameters + !> \param error integer, optional to get an error code + !> \result obj class(elpa_impl_t) allocated ELPA object + function elpa_impl_allocate(error) result(obj) + type(elpa_impl_t), pointer :: obj +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + integer :: error2, output_build_config + + allocate(obj, stat=error2) + if (error2 .ne. 0) then + write(error_unit, *) "elpa_allocate(): could not allocate object" + endif + + obj%from_legacy_api = 0 + + ! check whether init has ever been called + if ( elpa_initialized() .ne. 
ELPA_OK) then + write(error_unit, *) "elpa_allocate(): you must call elpa_init() once before creating instances of ELPA" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_API_VERSION + endif +#else + error = ELPA_ERROR_API_VERSION +#endif + return + endif + + obj%index = elpa_index_instance_c() + + ! Associate some important integer pointers for convenience + obj%na => obj%associate_int("na") + obj%nev => obj%associate_int("nev") + obj%local_nrows => obj%associate_int("local_nrows") + obj%local_ncols => obj%associate_int("local_ncols") + obj%nblk => obj%associate_int("nblk") + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + end function + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #define elpa_allocate(...) CONC(elpa_allocate, NARGS(__VA_ARGS__))(__VA_ARGS__) + !c_o> #endif +#endif + !c> /*! \brief C interface for the implementation of the elpa_allocate method + !c> * + !c> * \param none + !c> * \result elpa_t handle + !c> */ +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> elpa_t elpa_allocate2(int *error); + !c_o> elpa_t elpa_allocate1(); + !c_o> #endif + function elpa_impl_allocate_c1() result(ptr) bind(C, name="elpa_allocate1") + type(c_ptr) :: ptr + type(elpa_impl_t), pointer :: obj + + obj => elpa_impl_allocate() + ptr = c_loc(obj) + end function + + function elpa_impl_allocate_c2(error) result(ptr) bind(C, name="elpa_allocate2") + integer(kind=c_int) :: error + type(c_ptr) :: ptr + type(elpa_impl_t), pointer :: obj + + obj => elpa_impl_allocate(error) + ptr = c_loc(obj) + end function +#else + !c_no> #ifndef OPTIONAL_C_ERROR_ARGUMENT + !c_no> elpa_t elpa_allocate(int *error); + !c_no> #endif + function elpa_impl_allocate_c(error) result(ptr) bind(C, name="elpa_allocate") + integer(kind=c_int) :: error + type(c_ptr) :: ptr + type(elpa_impl_t), pointer :: obj + + obj => elpa_impl_allocate(error) + ptr = 
c_loc(obj) + end function +#endif + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #define NARGS(...) NARGS_(__VA_ARGS__, 5, 4, 3, 2, 1, 0) + !c_o> #define NARGS_(_5, _4, _3, _2, _1, N, ...) N + !c_o> #define CONC(A, B) CONC_(A, B) + !c_o> #define CONC_(A, B) A##B + !c_o> #define elpa_deallocate(...) CONC(elpa_deallocate, NARGS(__VA_ARGS__))(__VA_ARGS__) + !c_o> #endif +#endif + !c> /*! \brief C interface for the implementation of the elpa_deallocate method + !c> * + !c> * \param elpa_t handle of ELPA object to be deallocated + !c> * \param int* error code + !c> * \result void + !c> */ +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> void elpa_deallocate2(elpa_t handle, int *error); + !c_o> void elpa_deallocate1(elpa_t handle); + !c_o> #endif + subroutine elpa_impl_deallocate_c2(handle, error) bind(C, name="elpa_deallocate2") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call self%destroy(error) + deallocate(self) + end subroutine + + subroutine elpa_impl_deallocate_c1(handle) bind(C, name="elpa_deallocate1") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call self%destroy() + deallocate(self) + end subroutine +#else + !c_no> #ifndef OPTIONAL_C_ERROR_ARGUMENT + !c_no> void elpa_deallocate(elpa_t handle, int *error); + !c_no> #endif + subroutine elpa_impl_deallocate_c(handle, error) bind(C, name="elpa_deallocate") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call self%destroy(error) + deallocate(self) + end subroutine + +#endif + + !> \brief function to load all the parameters, which have been saved to a file + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param file_name string, the name of the file from which to load the 
parameters + !> \param error integer, optional + subroutine elpa_load_settings(self, file_name, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + if (elpa_index_load_settings_c(self%index, file_name // c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_load_settings())" + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CANNOT_OPEN_FILE + endif +#else + error = ELPA_ERROR_CANNOT_OPEN_FILE +#endif + endif + end subroutine + + !c> /*! \brief C interface for the implementation of the elpa_load_settings method + !c> * + !c> * \param elpa_t handle + !c> * \param char* filename + !c> */ + !c> void elpa_load_settings(elpa_t handle, const char *filename, int *error); + subroutine elpa_load_settings_c(handle, filename_p, error) bind(C, name="elpa_load_settings") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + + integer(kind=c_int) :: error + type(c_ptr), intent(in), value :: filename_p + character(len=elpa_strlen_c(filename_p)), pointer :: filename + + call c_f_pointer(handle, self) + call c_f_pointer(filename_p, filename) + call elpa_load_settings(self, filename, error) + + end subroutine + + !> \brief function to print all the parameters, that have been set + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param error optional, integer + subroutine elpa_print_settings(self, error) + implicit none + class(elpa_impl_t), intent(inout) :: self +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + 
error = ELPA_OK +#endif + if (elpa_index_print_settings_c(self%index, c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_print_settings())" + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + endif + end subroutine + + !c> /*! \brief C interface for the implementation of the elpa_print_settings method + !c> * + !c> * \param elpa_t handle + !c> * \param char* filename + !c> */ + !c> void elpa_print_settings(elpa_t handle, int *error); + subroutine elpa_print_settings_c(handle, error) bind(C, name="elpa_print_settings") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call elpa_print_settings(self, error) + + end subroutine + + + !> \brief function to save all the parameters, that have been set + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param file_name string, the name of the file where to save the parameters + !> \param error integer, optional + subroutine elpa_store_settings(self, file_name, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + if (elpa_index_print_settings_c(self%index, file_name // c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_store_settings())" + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CANNOT_OPEN_FILE + endif +#else + error = ELPA_ERROR_CANNOT_OPEN_FILE +#endif + endif + end subroutine + + + !c> /*! 
\brief C interface for the implementation of the elpa_store_settings method + !c> * + !c> * \param elpa_t handle + !c> * \param char* filename + !c> */ + !c> void elpa_store_settings(elpa_t handle, const char *filename, int *error); + subroutine elpa_store_settings_c(handle, filename_p, error) bind(C, name="elpa_store_settings") + type(c_ptr), value :: handle + type(elpa_impl_t), pointer :: self + type(c_ptr), intent(in), value :: filename_p + character(len=elpa_strlen_c(filename_p)), pointer :: filename + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(filename_p, filename) + call elpa_store_settings(self, filename, error) + + end subroutine + + +#ifdef ENABLE_AUTOTUNING +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #define elpa_autotune_deallocate(...) CONC(elpa_autotune_deallocate, NARGS(__VA_ARGS__))(__VA_ARGS__) + !c_o> #endif +#endif + !c> /*! \brief C interface for the implementation of the elpa_autotune_deallocate method + !c> * + !c> * \param elpa_autotune_impl_t handle of ELPA autotune object to be deallocated + !c> * \result void + !c> */ +#ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> #ifdef OPTIONAL_C_ERROR_ARGUMENT + !c_o> void elpa_autotune_deallocate2(elpa_autotune_t handle, int *error); + !c_o> void elpa_autotune_deallocate1(elpa_autotune_t handle); + !c_o> #endif + subroutine elpa_autotune_impl_deallocate_c1( autotune_handle) bind(C, name="elpa_autotune_deallocate1") + type(c_ptr), value :: autotune_handle + + type(elpa_autotune_impl_t), pointer :: self + integer(kind=c_int) :: error + + call c_f_pointer(autotune_handle, self) + call self%destroy(error) + deallocate(self) + end subroutine + + subroutine elpa_autotune_impl_deallocate_c2( autotune_handle, error) bind(C, name="elpa_autotune_deallocate2") + type(c_ptr), value :: autotune_handle + + type(elpa_autotune_impl_t), pointer :: self + integer(kind=c_int) :: error + call c_f_pointer(autotune_handle, self) + call 
self%destroy(error) + deallocate(self) + end subroutine +#else + !c_no> #ifndef OPTIONAL_C_ERROR_ARGUMENT + !c_no> void elpa_autotune_deallocate(elpa_autotune_t handle, int *error); + !c_no> #endif + subroutine elpa_autotune_impl_deallocate( autotune_handle, error) bind(C, name="elpa_autotune_deallocate") + type(c_ptr), value :: autotune_handle + + type(elpa_autotune_impl_t), pointer :: self + integer(kind=c_int) :: error + call c_f_pointer(autotune_handle, self) + call self%destroy(error) + deallocate(self) + end subroutine + +#endif +#endif /* ENABLE_AUTOTUNING */ + + !> \brief function to setup an ELPA object and to store the MPI communicators internally + !> Parameters + !> \param self class(elpa_impl_t), the allocated ELPA object + !> \result error integer, the error code + function elpa_setup(self) result(error) + class(elpa_impl_t), intent(inout) :: self + integer :: error, timings, performance, build_config + +#ifdef WITH_MPI + integer :: mpi_comm_parent, mpi_comm_rows, mpi_comm_cols, np_rows, np_cols, my_id, & + process_row, process_col, mpi_string_length, & + present_np_rows, present_np_cols, np_total + integer(kind=MPI_KIND) :: mpierr, mpierr2, my_idMPI, np_totalMPI, process_rowMPI, process_colMPI + integer(kind=MPI_KIND) :: mpi_comm_rowsMPI, mpi_comm_colsMPI, np_rowsMPI, np_colsMPI, & + mpi_string_lengthMPI + character(len=MPI_MAX_ERROR_STRING) :: mpierr_string + character(*), parameter :: MPI_CONSISTENCY_MSG = & + "Provide mpi_comm_parent and EITHER process_row and process_col OR mpi_comm_rows and mpi_comm_cols. Aborting..." 
+ +#endif + + +#ifdef HAVE_LIKWID + !initialize likwid + call likwid_markerInit() + call likwid_markerThreadInit() + call likwid_markerStartRegion("TOTAL") +#endif + +#ifdef HAVE_DETAILED_TIMINGS + call self%get("timings",timings, error) + call self%get("measure_performance",performance, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + if (timings == 1) then + call self%timer%enable() + if (performance == 1) then + call self%timer%measure_flops(.true.) + call self%timer%set_print_options(print_flop_count=.true.,print_flop_rate=.true.) + endif + endif +#endif + + error = ELPA_OK + + ! In most cases, we actually need the parent communicator to be supplied, + ! ELPA internally requires it when either GPU is enabled or when ELPA2 is + ! used. It thus seems reasonable that we should ALLWAYS require it. It + ! should then be accompanied by EITHER process_row and process_col + ! indices, OR mpi_comm_rows and mpi_comm_cols communicators, but NOT both. + ! This assumption will significanlty simplify the logic, avoid possible + ! inconsistencies and is rather natural from the user point of view + +#ifdef WITH_MPI + if (self%is_set("mpi_comm_parent") == 1) then + call self%get("mpi_comm_parent", mpi_comm_parent, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + call mpi_comm_rank(int(mpi_comm_parent,kind=MPI_KIND), my_idMPI, mpierr) + my_id = int(my_idMPI, kind=c_int) + call self%set("process_id", my_id, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + + call mpi_comm_size(int(mpi_comm_parent,kind=MPI_KIND), np_totalMPI, mpierr) + np_total = int(np_totalMPI,kind=c_int) + call self%set("num_processes", np_total, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + else + if (self%from_legacy_api .ne. 1) then + write(error_unit,*) MPI_CONSISTENCY_MSG + error = ELPA_ERROR + return + endif + endif + + ! Create communicators ourselves + if (self%is_set("process_row") == 1 .and. 
self%is_set("process_col") == 1) then + + if (self%is_set("mpi_comm_rows") == 1 .or. self%is_set("mpi_comm_cols") == 1) then + write(error_unit,*) MPI_CONSISTENCY_MSG + error = ELPA_ERROR + return + endif + + call self%get("process_row", process_row, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + call self%get("process_col", process_col, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + ! mpi_comm_rows is used for communicating WITHIN rows, i.e. all processes + ! having the same column coordinate share one mpi_comm_rows. + ! So the "color" for splitting is process_col and the "key" is my row coordinate. + ! Analogous for mpi_comm_cols + + call mpi_comm_split(int(mpi_comm_parent,kind=MPI_KIND), int(process_col,kind=MPI_KIND), & + int(process_row,kind=MPI_KIND), mpi_comm_rowsMPI, mpierr) + mpi_comm_rows = int(mpi_comm_rowsMPI,kind=c_int) + if (mpierr .ne. MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_lengthMPI, mpierr2) + mpi_string_length = int(mpi_string_lengthMPI, kind=c_int) + write(error_unit,*) "MPI ERROR occured during mpi_comm_split for row communicator: ", trim(mpierr_string) + return + endif + + call mpi_comm_split(int(mpi_comm_parent,kind=MPI_KIND), int(process_row,kind=MPI_KIND), & + int(process_col,kind=MPI_KIND), mpi_comm_colsMPI, mpierr) + mpi_comm_cols = int(mpi_comm_colsMPI,kind=c_int) + if (mpierr .ne. MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_lengthMPI, mpierr2) + mpi_string_length = int(mpi_string_lengthMPI, kind=c_int) + write(error_unit,*) "MPI ERROR occured during mpi_comm_split for col communicator: ", trim(mpierr_string) + return + endif + + call self%set("mpi_comm_rows", mpi_comm_rows,error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + + call self%set("mpi_comm_cols", mpi_comm_cols,error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + + ! 
remember that we created those communicators and we need to free them later + self%communicators_owned = 1 + + ! Externally supplied communicators + else if ( self%is_set("mpi_comm_rows") == 1 .and. self%is_set("mpi_comm_cols") == 1) then + + if (self%is_set("process_row") == 1 .or. self%is_set("process_col") == 1) then + write(error_unit,*) MPI_CONSISTENCY_MSG + error = ELPA_ERROR + return + endif + + call self%get("mpi_comm_rows", mpi_comm_rows,error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + call self%get("mpi_comm_cols", mpi_comm_cols,error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + process_rowMPI = int(process_row,kind=c_int) + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), process_rowMPI, mpierr) + process_row = int(process_rowMPI,kind=MPI_KIND) + call self%set("process_row", process_row, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + + process_colMPI = int(process_col,kind=c_int) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND), process_colMPI, mpierr) + process_col = int(process_colMPI,kind=MPI_KIND) + call self%set("process_col", process_col, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + + ! remember that we DID NOT created those communicators and we WILL NOT free them later + self%communicators_owned = 0 + else + ! Otherwise parameters are missing + write(error_unit,*) MPI_CONSISTENCY_MSG + error = ELPA_ERROR + return + endif + + ! set num_process_rows (and cols), if they are not supplied. Check them + ! for consistency if they are. Maybe we could instead require, that they + ! are never supplied? + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr) + np_rows = int(np_rowsMPI, kind=c_int) + if (self%is_set("num_process_rows") == 1) then + call self%get("num_process_rows", present_np_rows, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + if (np_rows .ne. present_np_rows) then + print *,"MPI row communicator not set correctly. Aborting..." 
+ stop + endif + else + call self%set("num_process_rows", np_rows, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + endif + + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr) + np_cols = int(np_colsMPI, kind=c_int) + if (self%is_set("num_process_cols") == 1) then + call self%get("num_process_cols", present_np_cols, error) + if (check_elpa_get(error, ELPA_ERROR_SETUP)) return + + if (np_cols .ne. present_np_cols) then + print *,"MPI column communicator not set correctly. Aborting..." + stop + endif + else + call self%set("num_process_cols", np_cols, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + endif + + if (self%from_legacy_api .ne. 1) then + if (np_total .ne. np_rows * np_cols) then + print *,"MPI parent communicator and row/col communicators do not match. Aborting..." + stop + endif + endif + +#else + call self%set("process_row", 0, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + call self%set("process_col", 0, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + call self%set("process_id", 0, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + call self%set("num_process_rows", 1, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + call self%set("num_process_cols", 1, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return + call self%set("num_processes", 1, error) + if (check_elpa_set(error, ELPA_ERROR_SETUP)) return +#endif + +#if STORE_BUILD_CONFIG + call self%get("output_build_config",build_config, error) + if ( build_config .eq. 1) then +#ifdef WITH_MPI + if (my_id .eq. 0) then +#endif + call print_build_config() +#ifdef WITH_MPI + endif +#endif + endif +#endif + end function + + + !c> /*! 
\brief C interface for the implementation of the elpa_setup method + !c> * + !c> * \param elpa_t handle of the ELPA object which describes the problem to + !c> * be set up + !c> * \result int error code, which can be queried with elpa_strerr + !c> */ + !c> int elpa_setup(elpa_t handle); + function elpa_setup_c(handle) result(error) bind(C, name="elpa_setup") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + error = self%setup() + end function + + function elpa_construct_scalapack_descriptor(self, sc_desc, rectangular_for_ev) result(error) + class(elpa_impl_t), intent(inout) :: self + logical, intent(in) :: rectangular_for_ev + integer :: error, blacs_ctx + integer, intent(out) :: sc_desc(SC_DESC_LEN) + +#ifdef WITH_MPI + if (self%is_set("blacs_context") == 0) then + print *,"BLACS context has not been set beforehand. Aborting..." + stop + endif + call self%get("blacs_context", blacs_ctx, error) + if (check_elpa_get(error, ELPA_ERROR_CRITICAL)) return + + sc_desc(1) = 1 + sc_desc(2) = blacs_ctx + sc_desc(3) = self%na + if (rectangular_for_ev) then + sc_desc(4) = self%nev + else + sc_desc(4) = self%na + endif + sc_desc(5) = self%nblk + sc_desc(6) = self%nblk + sc_desc(7) = 0 + sc_desc(8) = 0 + sc_desc(9) = self%local_nrows +#else + sc_desc = 0 +#endif + error = ELPA_OK + end function + + + !c> /*! 
\brief C interface for the implementation of the elpa_set_integer method + !c> * This method is available to the user as C generic elpa_set method + !c> * + !c> * \param handle handle of the ELPA object for which a key/value pair should be set + !c> * \param name the name of the key + !c> * \param value the value to be set for the key + !c> * \param error on return the error code, which can be queried with elpa_strerr() + !c> * \result void + !c> */ + !c> void elpa_set_integer(elpa_t handle, const char *name, int value, int *error); + subroutine elpa_set_integer_c(handle, name_p, value, error) bind(C, name="elpa_set_integer") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + type(c_ptr), intent(in), value :: name_p + character(len=elpa_strlen_c(name_p)), pointer :: name + integer(kind=c_int), intent(in), value :: value + integer(kind=c_int) , intent(in) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(name_p, name) + call elpa_set_integer(self, name, value, error) + end subroutine + + + !c> /*! 
\brief C interface for the implementation of the elpa_get_integer method + !c> * This method is available to the user as C generic elpa_get method + !c> * + !c> * \param handle handle of the ELPA object for which a key/value pair should be queried + !c> * \param name the name of the key + !c> * \param value the value to be obtain for the key + !c> * \param error on return the error code, which can be queried with elpa_strerr() + !c> * \result void + !c> */ + !c> void elpa_get_integer(elpa_t handle, const char *name, int *value, int *error); + subroutine elpa_get_integer_c(handle, name_p, value, error) bind(C, name="elpa_get_integer") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + type(c_ptr), intent(in), value :: name_p + character(len=elpa_strlen_c(name_p)), pointer :: name + integer(kind=c_int) :: value + integer(kind=c_int), intent(inout) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(name_p, name) + call elpa_get_integer(self, name, value, error) + end subroutine + + + !> \brief function to check whether a key/value pair is set + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name string, the key + !> \result state integer, the state of the key/value pair + function elpa_is_set(self, name) result(state) + class(elpa_impl_t) :: self + character(*), intent(in) :: name + integer :: state + + state = elpa_index_value_is_set_c(self%index, name // c_null_char) + end function + + !> \brief function to check whether a key/value pair can be set + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name string, the key + !> \param value integer, value + !> \result error integer, error code + function elpa_can_set(self, name, value) result(error) + class(elpa_impl_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), intent(in) :: value + integer :: error + + error = elpa_index_int_is_valid_c(self%index, name // c_null_char, value) 
+ end function + + + !> \brief function to convert a value to an human readable string + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param option_name string: the name of the options, whose value should be converted + !> \param error integer: errpr code + !> \result string string: the humanreadable string + function elpa_value_to_string(self, option_name, error) result(string) + class(elpa_impl_t), intent(in) :: self + character(kind=c_char, len=*), intent(in) :: option_name + type(c_ptr) :: ptr +#ifdef USE_FORTRAN2008 + integer, intent(out), optional :: error +#else + integer, intent(out) :: error +#endif + + integer :: val, actual_error + character(kind=c_char, len=elpa_index_int_value_to_strlen_c(self%index, option_name // C_NULL_CHAR)), pointer :: string + + nullify(string) + + call self%get(option_name, val, actual_error) + if (actual_error /= ELPA_OK) then +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + endif +#else + error = actual_error +#endif + return + endif + + actual_error = elpa_int_value_to_string_c(option_name // C_NULL_CHAR, val, ptr) + if (c_associated(ptr)) then + call c_f_pointer(ptr, string) + endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = actual_error + endif +#else + error = actual_error +#endif + end function + + + !c> /*! 
\brief C interface for the implementation of the elpa_set_double method + !c> * This method is available to the user as C generic elpa_set method + !c> * + !c> * \param handle handle of the ELPA object for which a key/value pair should be set + !c> * \param name the name of the key + !c> * \param value the value to be set for the key + !c> * \param error on return the error code, which can be queried with elpa_strerr() + !c> * \result void + !c> */ + !c> void elpa_set_double(elpa_t handle, const char *name, double value, int *error); + subroutine elpa_set_double_c(handle, name_p, value, error) bind(C, name="elpa_set_double") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + type(c_ptr), intent(in), value :: name_p + character(len=elpa_strlen_c(name_p)), pointer :: name + real(kind=c_double), intent(in), value :: value + integer(kind=c_int), intent(in) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(name_p, name) + call elpa_set_double(self, name, value, error) + end subroutine + + + !c> /*! 
\brief C interface for the implementation of the elpa_get_double method + !c> * This method is available to the user as C generic elpa_get method + !c> * + !c> * \param handle handle of the ELPA object for which a key/value pair should be queried + !c> * \param name the name of the key + !c> * \param value the value to be obtain for the key + !c> * \param error on return the error code, which can be queried with elpa_strerr() + !c> * \result void + !c> */ + !c> void elpa_get_double(elpa_t handle, const char *name, double *value, int *error); + subroutine elpa_get_double_c(handle, name_p, value, error) bind(C, name="elpa_get_double") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + type(c_ptr), intent(in), value :: name_p + character(len=elpa_strlen_c(name_p)), pointer :: name + real(kind=c_double) :: value + integer(kind=c_int), intent(inout) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(name_p, name) + call elpa_get_double(self, name, value, error) + end subroutine + + + !> \brief function to associate a pointer with an integer value + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name string: the name of the entry + !> \result value integer, pointer: the value for the entry + function elpa_associate_int(self, name) result(value) + class(elpa_impl_t) :: self + character(*), intent(in) :: name + integer(kind=c_int), pointer :: value + + type(c_ptr) :: value_p + + value_p = elpa_index_get_int_loc_c(self%index, name // c_null_char) + if (.not. c_associated(value_p)) then + write(error_unit, '(a,a,a)') "ELPA: Warning, received NULL pointer for entry '", name, "'" + endif + call c_f_pointer(value_p, value) + end function + + + !> \brief function to querry the timing information at a certain level + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name1 .. name6 string: the string identifier for the timer region. 
+ !> at the moment 6 nested levels can be queried + !> \result s double: the timer metric for the region. Might be seconds, + !> or any other supported metric + function elpa_get_time(self, name1, name2, name3, name4, name5, name6) result(s) + class(elpa_impl_t), intent(in) :: self + ! this is clunky, but what can you do.. + character(len=*), intent(in), optional :: name1, name2, name3, name4, name5, name6 + real(kind=c_double) :: s + +#ifdef HAVE_DETAILED_TIMINGS + s = self%timer%get(name1, name2, name3, name4, name5, name6) +#else + s = -1.0 +#endif + end function + + + !> \brief function to print the timing tree below at a certain level + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name1 .. name6 string: the string identifier for the timer region. + !> at the moment 4 nested levels can be specified + subroutine elpa_print_times(self, name1, name2, name3, name4) + class(elpa_impl_t), intent(in) :: self + character(len=*), intent(in), optional :: name1, name2, name3, name4 +#ifdef HAVE_DETAILED_TIMINGS + call self%timer%print(name1, name2, name3, name4) +#endif + end subroutine + + + !> \brief function to start the timing of a code region + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name string: a chosen identifier name for the code region + subroutine elpa_timer_start(self, name) + class(elpa_impl_t), intent(inout) :: self + character(len=*), intent(in) :: name +#ifdef HAVE_DETAILED_TIMINGS + call self%timer%start(name) +#endif + end subroutine + + + !> \brief function to stop the timing of a code region + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param name string: identifier name for the code region to stop + subroutine elpa_timer_stop(self, name) + class(elpa_impl_t), intent(inout) :: self + character(len=*), intent(in) :: name +#ifdef HAVE_DETAILED_TIMINGS + call self%timer%stop(name) +#endif + end subroutine + + + !> \brief 
function to destroy an elpa object + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param error integer, optional error code + subroutine elpa_destroy(self, error) +#ifdef WITH_MPI + integer :: mpi_comm_rows, mpi_comm_cols, & + mpi_string_length + integer(kind=MPI_KIND) :: mpierr, mpierr2, mpi_string_lengthMPI, & + mpi_comm_rowsMPI, mpi_comm_colsMPI + character(len=MPI_MAX_ERROR_STRING) :: mpierr_string +#endif + class(elpa_impl_t) :: self +#ifdef USE_FORTRAN2008 + integer, optional, intent(out) :: error +#else + integer, intent(out) :: error +#endif + integer :: error2 + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + +#ifdef HAVE_LIKWID + call likwid_markerStopRegion("TOTAL") + call likwid_markerClose() +#endif + +#ifdef WITH_MPI + if (self%communicators_owned == 1) then + call self%get("mpi_comm_rows", mpi_comm_rows, error2) + if (error2 .ne. ELPA_OK) then +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + else + write(error_unit, *) "Error in elpa_destroy but you do not check the error codes!" + endif +#else + error = error2 +#endif + return + endif ! error happend + + call self%get("mpi_comm_cols", mpi_comm_cols,error2) + if (error2 .ne. ELPA_OK) then +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + else + write(error_unit, *) "Error in elpa_destroy but you do not check the error codes!" + endif +#else + error = error2 +#endif + return + endif ! error happend + + ! this is just for debugging ! do not leave in a relase + !write(error_unit, '(A,2I13)') "FREE comms", mpi_comm_rows, mpi_comm_cols + mpi_comm_rowsMPI = int(mpi_comm_rows,kind=MPI_KIND) + call mpi_comm_free(mpi_comm_rowsMPI, mpierr) + mpi_comm_rows = int(mpi_comm_rowsMPI,kind=c_int) + if (mpierr .ne. 
MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_lengthMPI, mpierr2) + mpi_string_length = int(mpi_string_lengthMPI,kind=c_int) + write(error_unit,*) "MPI ERROR occured during mpi_comm_free for row communicator: ", trim(mpierr_string) +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + return + endif ! mpierr happend + call self%set("mpi_comm_cols", -12345, error2) + if (error2 .ne. ELPA_OK) then +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + else + write(error_unit, *) "Error in elpa_destroy but you do not check the error codes!" + endif +#else + error = error2 +#endif + return + endif ! error happend + mpi_comm_colsMPI = int(mpi_comm_cols,kind=MPI_KIND) + call mpi_comm_free(mpi_comm_colsMPI, mpierr) + mpi_comm_cols = int(mpi_comm_colsMPI, kind=c_int) + if (mpierr .ne. MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_lengthMPI, mpierr2) + mpi_string_length = int(mpi_string_lengthMPI,kind=c_int) + write(error_unit,*) "MPI ERROR occured during mpi_comm_free for col communicator: ", trim(mpierr_string) +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + return + endif ! mpierr happend + call self%set("mpi_comm_rows", -12345,error2) + if (error2 .ne. ELPA_OK) then +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + else + write(error_unit, *) "Error in elpa_destroy but you do not check the error codes!" + endif +#else + error = error2 +#endif + return + endif ! 
error happend + endif +#endif /* WITH_MPI */ + + call timer_free(self%timer) + call timer_free(self%autotune_timer) + call elpa_index_free_c(self%index) + + end subroutine + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#define INCLUDE_ROUTINES 1 +#include "general/precision_macros.h" +#include "elpa_impl_math_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION +#undef INCLUDE_ROUTINES + +#ifdef WANT_SINGLE_PRECISION_REAL +#define INCLUDE_ROUTINES 1 +#endif +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_impl_math_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#undef INCLUDE_ROUTINES + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#define INCLUDE_ROUTINES 1 +#include "general/precision_macros.h" +#include "elpa_impl_math_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE +#undef INCLUDE_ROUTINES + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define INCLUDE_ROUTINES 1 +#endif +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "general/precision_macros.h" +#include "elpa_impl_math_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION +#undef INCLUDE_ROUTINES + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_impl_generalized_transform_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#ifdef WANT_SINGLE_PRECISION_REAL +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_impl_generalized_transform_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION +#endif + +#define COMPLEXCASE 1 + +#define DOUBLE_PRECISION 1 +#include "general/precision_macros.h" +#include "elpa_impl_generalized_transform_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#define COMPLEXCASE 1 +#define SINGLE_PRECISION +#include "general/precision_macros.h" +#include "elpa_impl_generalized_transform_template.F90" +#undef COMPLEXCASE +#undef 
SINGLE_PRECISION +#endif + + +! function use_cannons_algorithm(self) result(use_cannon, do_print) +! class(elpa_impl_t), intent(inout), target :: self +! logical :: use_cannon +! logical, intent(in) :: do_print +! end function +! + + + + +#ifdef ENABLE_AUTOTUNING + !> \brief function to setup the ELPA autotuning and create the autotune object + !> Parameters + !> \param self the allocated ELPA object + !> \param level integer: the "thoroughness" of the planed autotuning + !> \param domain integer: the domain (real/complex) which should be tuned + !> \result tune_state the created autotuning object + function elpa_autotune_setup(self, level, domain, error) result(tune_state) + class(elpa_impl_t), intent(inout), target :: self + integer, intent(in) :: level, domain + type(elpa_autotune_impl_t), pointer :: ts_impl + class(elpa_autotune_t), pointer :: tune_state +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional :: error +#else + integer(kind=c_int) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + + if (elpa_get_api_version() < EARLIEST_AUTOTUNE_VERSION) then + write(error_unit, "(a,i0,a)") "ELPA: Error API version: Autotuning does not support ", elpa_get_api_version() +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_AUTOTUNE_API_VERSION + endif +#else + error = ELPA_ERROR_AUTOTUNE_API_VERSION +#endif + return + endif + + allocate(ts_impl) + ts_impl%parent => self + ts_impl%level = level + ts_impl%domain = domain + + ts_impl%current = -1 + ts_impl%min_loc = -1 + ts_impl%cardinality = elpa_index_autotune_cardinality_c(self%index, level, domain) + + tune_state => ts_impl + + call self%autotune_timer%enable() + end function + + + + !c> /*! 
\brief C interface for the implementation of the elpa_autotune_setup method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param int level: "thoroughness" of autotuning + !c> * \param int domain: real/complex autotuning + !c> * \result elpa_autotune_t handle: on the autotune object + !c> */ + !c> elpa_autotune_t elpa_autotune_setup(elpa_t handle, int level, int domain, int *error); + function elpa_autotune_setup_c(handle ,level, domain, error) result(ptr) bind(C, name="elpa_autotune_setup") + type(c_ptr), intent(in), value :: handle + type(elpa_impl_t), pointer :: self + class(elpa_autotune_t), pointer :: tune_state + type(elpa_autotune_impl_t), pointer :: obj + integer(kind=c_int), intent(in), value :: level + integer(kind=c_int), intent(in), value :: domain + type(c_ptr) :: ptr + integer(kind=c_int) , intent(in) :: error + + call c_f_pointer(handle, self) + + tune_state => self%autotune_setup(level, domain, error) + select type(tune_state) + type is (elpa_autotune_impl_t) + obj => tune_state + class default + print *, "This should not happen" + stop + end select + ptr = c_loc(obj) + + end function + + + !> \brief function to do an autotunig step + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \result unfinished logical: describes the state of the autotuning (completed/uncompleted) + function elpa_autotune_step(self, tune_state, error) result(unfinished) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(inout), target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + integer(kind=c_int) :: error2, error3 + integer :: mpi_comm_parent, mpi_string_length, np_total + integer(kind=MPI_KIND) :: mpierr, mpierr2, mpi_string_lengthMPI + 
logical :: unfinished + integer :: i + real(kind=C_DOUBLE) :: time_spent, sendbuf(1), recvbuf(1) +#ifdef WITH_MPI + character(len=MPI_MAX_ERROR_STRING) :: mpierr_string +#endif + + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => tune_state + class default + print *, "This should not happen" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR + endif +#else + error = ELPA_ERROR +#endif + end select + + unfinished = .false. + + if (ts_impl%current >= 0) then +#ifdef HAVE_DETAILED_TIMINGS + time_spent = self%autotune_timer%get("accumulator") +#else + print *, "Cannot do autotuning without detailed timings" + + ! TODO check this. Do we really want to return only if error is present? And should it be ELPA_OK? +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_OK +#endif + return +#endif /* HAVE_DETAILED_TIMINGS */ + +#ifdef WITH_MPI + ! find the average time spent .. we need a unique value on all ranks + call self%get("mpi_comm_parent", mpi_comm_parent, error2) + call self%get("num_processes", np_total, error3) + if ((error2 .ne. ELPA_OK) .or. (error3 .ne. ELPA_OK)) then + print *, "Parent communicator is not set properly. Aborting..." +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + return + endif + + sendbuf(1) = time_spent + call MPI_Allreduce(sendbuf, recvbuf, 1_MPI_KIND, MPI_REAL8, MPI_SUM, int(mpi_comm_parent,kind=MPI_KIND), mpierr) + if (mpierr .ne. 
MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_lengthMPI, mpierr2) + mpi_string_length = int(mpi_string_lengthMPI,kind=c_int) + write(error_unit,*) "MPI ERROR occured during elpa_autotune_step: ", trim(mpierr_string) + return + endif + time_spent = recvbuf(1) / np_total +#endif /* WITH_MPI */ + + if (ts_impl%min_loc == -1 .or. (time_spent < ts_impl%min_val)) then + ts_impl%min_val = time_spent + ts_impl%min_loc = ts_impl%current + end if + call self%autotune_timer%free() + endif ! (ts_impl%current >= 0) + + do while (ts_impl%current < ts_impl%cardinality - 1) + ts_impl%current = ts_impl%current + 1 + if (elpa_index_set_autotune_parameters_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%current) == 1) then + unfinished = .true. + return + end if + end do + + end function + + + + !c> /*! \brief C interface for the implementation of the elpa_autotune_step method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int *error code + !c> * \result int unfinished: describes whether autotuning finished (0) or not (1) + !c> */ + !c> int elpa_autotune_step(elpa_t handle, elpa_autotune_t autotune_handle, int *error); + function elpa_autotune_step_c(handle, autotune_handle, & + error) result(unfinished) bind(C, name="elpa_autotune_step") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + logical :: unfinished_f + integer(kind=c_int) :: unfinished + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(autotune_handle, tune_state) + + unfinished_f = self%autotune_step(tune_state, error) + if (unfinished_f) then + unfinished = 1 + else + unfinished = 0 + endif + + end function + + !> \brief function to set the up-to-now best options of the autotuning + !> Parameters + 
!> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error code optional, integer + subroutine elpa_autotune_set_best(self, tune_state, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl +#ifdef USE_FORTRAN2008 + integer(kind=ik), optional, intent(out) :: error +#else + integer(kind=ik), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => tune_state + class default + write(error_unit, *) "This should not happen! Critical error" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + end select + + if (elpa_index_set_autotune_parameters_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc) /= 1) then + write(error_unit, *) "This should not happen (in elpa_autotune_set_best())" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED + endif +#else + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED +#endif + endif + end subroutine + + + + !> \brief function to print the up-to-now best options of the autotuning + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error integer, optional + subroutine elpa_autotune_print_best(self, tune_state, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if 
(present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => tune_state + class default + write(error_unit, *) "This should not happen! Critical error" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + end select + + !print *, "The following settings were found to be best:" + !print *, "Best, i = ", ts_impl%min_loc, "best time = ", ts_impl%min_val + flush(output_unit) + if (elpa_index_print_autotune_parameters_c(self%index, ts_impl%level, ts_impl%domain) /= 1) then + write(error_unit, *) "This should not happen (in elpa_autotune_print_best())" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED + endif +#else + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED +#endif + endif + end subroutine + + + + !> \brief function to print the state of the autotuning + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param error integer, optional + subroutine elpa_autotune_print_state(self, tune_state, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => tune_state + class default + write(error_unit, *) "This should not happen! 
Critical erro" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + end select + + if (elpa_index_print_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, & + ts_impl%min_val, ts_impl%current, ts_impl%cardinality, c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_autotune_print_state())" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED + endif +#else + error = ELPA_ERROR_AUTOTUNE_OBJECT_CHANGED +#endif + endif + end subroutine + + + !c> /*! \brief C interface for the implementation of the elpa_autotune_print_state method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int * + !c> * \result none + !c> */ + !c> void elpa_autotune_print_state(elpa_t handle, elpa_autotune_t autotune_handle, int *error); + subroutine elpa_autotune_print_state_c(handle, autotune_handle, error) bind(C, name="elpa_autotune_print_state") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(autotune_handle, tune_state) + + call self%autotune_print_state(tune_state, error) + + end subroutine + + + + !> \brief function to save the state of the autotuning + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param file_name string, the name of the file where to save the state + !> \param error integer, optional + subroutine elpa_autotune_save_state(self, tune_state, file_name, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), 
target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => tune_state + class default + write(error_unit, *) "This should not happen! Critical error" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + end select + + if (elpa_index_print_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, & + ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_autotune_save_state())" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CANNOT_OPEN_FILE + endif +#else + error = ELPA_ERROR_CANNOT_OPEN_FILE +#endif + endif + end subroutine + + + + !c> /*! 
\brief C interface for the implementation of the elpa_autotune_save_state method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int * + !c> * \result none + !c> */ + !c> void elpa_autotune_save_state(elpa_t handle, elpa_autotune_t autotune_handle, const char *filename, int *error); + subroutine elpa_autotune_save_state_c(handle, autotune_handle, filename_p, error) bind(C, name="elpa_autotune_save_state") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + type(c_ptr), intent(in), value :: filename_p + character(len=elpa_strlen_c(filename_p)), pointer :: filename + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(filename_p, filename) + call c_f_pointer(autotune_handle, tune_state) + + call self%autotune_save_state(tune_state, filename, error) + + end subroutine + + + + !> \brief function to load the state of the autotuning + !> Parameters + !> \param self class(elpa_impl_t) the allocated ELPA object + !> \param tune_state class(elpa_autotune_t): the autotuning object + !> \param file_name string, the name of the file from which to load the state + !> \param error integer, optional + subroutine elpa_autotune_load_state(self, tune_state, file_name, error) + implicit none + class(elpa_impl_t), intent(inout) :: self + class(elpa_autotune_t), intent(in), target :: tune_state + type(elpa_autotune_impl_t), pointer :: ts_impl + character(*), intent(in) :: file_name +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(out) :: error +#else + integer(kind=c_int), intent(out) :: error +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_OK + endif +#else + error = ELPA_OK +#endif + select type(tune_state) + type is (elpa_autotune_impl_t) + ts_impl => 
tune_state + class default + write(error_unit, *) "This should not happen! Critical error" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CRITICAL + endif +#else + error = ELPA_ERROR_CRITICAL +#endif + end select + + + if (elpa_index_load_autotune_state_c(self%index, ts_impl%level, ts_impl%domain, ts_impl%min_loc, & + ts_impl%min_val, ts_impl%current, ts_impl%cardinality, file_name // c_null_char) /= 1) then + write(error_unit, *) "This should not happen (in elpa_autotune_load_state())" +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = ELPA_ERROR_CANNOT_OPEN_FILE + endif +#else + error = ELPA_ERROR_CANNOT_OPEN_FILE +#endif + endif + end subroutine + + + + !c> /*! \brief C interface for the implementation of the elpa_autotune_load_state method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int * + !c> * \result none + !c> */ + !c> void elpa_autotune_load_state(elpa_t handle, elpa_autotune_t autotune_handle, const char *filename, int *error); + subroutine elpa_autotune_load_state_c(handle, autotune_handle, filename_p, error) bind(C, name="elpa_autotune_load_state") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + type(c_ptr), intent(in), value :: filename_p + character(len=elpa_strlen_c(filename_p)), pointer :: filename + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(filename_p, filename) + call c_f_pointer(autotune_handle, tune_state) + + call self%autotune_load_state(tune_state, filename, error) + + end subroutine + + + !c> /*! 
\brief C interface for the implementation of the elpa_autotune_set_best method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int * + !c> * \result none + !c> */ + !c> void elpa_autotune_set_best(elpa_t handle, elpa_autotune_t autotune_handle, int *error); + subroutine elpa_autotune_set_best_c(handle, autotune_handle, error) bind(C, name="elpa_autotune_set_best") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(autotune_handle, tune_state) + + call self%autotune_set_best(tune_state, error) + + end subroutine + + + !c> /*! \brief C interface for the implementation of the elpa_autotune_print_best method + !c> * + !c> * \param elpa_t handle: of the ELPA object which should be tuned + !c> * \param elpa_autotune_t autotune_handle: the autotuning object + !c> * \param error int * + !c> * \result none + !c> */ + !c> void elpa_autotune_print_best(elpa_t handle, elpa_autotune_t autotune_handle, int *error); + subroutine elpa_autotune_print_best_c(handle, autotune_handle, error) bind(C, name="elpa_autotune_print_best") + type(c_ptr), intent(in), value :: handle + type(c_ptr), intent(in), value :: autotune_handle + type(elpa_impl_t), pointer :: self + type(elpa_autotune_impl_t), pointer :: tune_state + integer(kind=c_int) :: error + + call c_f_pointer(handle, self) + call c_f_pointer(autotune_handle, tune_state) + + call self%autotune_print_best(tune_state, error) + + end subroutine + +#endif /* HAVE_AUTOTUNING */ + + function check_elpa(error, str, new_error) result(res) + integer, intent(inout) :: error + integer, intent(in) :: new_error + character(*) :: str + logical :: res + if (error .ne. 
ELPA_OK) then + print *, trim(str) + res = .true. + error = new_error + return + endif + res = .false. + end function + + function check_elpa_get(error, new_error) result(res) + integer, intent(inout) :: error + integer, intent(in) :: new_error + logical :: res + res = check_elpa(error, "Problem getting option. Aborting...", new_error) + return + end function + + function check_elpa_set(error, new_error) result(res) + integer, intent(inout) :: error + integer, intent(in) :: new_error + logical :: res + res = check_elpa(error, "Problem setting option. Aborting...", new_error) + return + end function + + subroutine elpa_creating_from_legacy_api(self) + implicit none + class(elpa_impl_t), intent(inout) :: self + + self%from_legacy_api = 1 + end subroutine +end module diff -Nru elpa-2016.05.001/src/elpa_impl_generalized_transform_template.F90 elpa-2019.11.001/src/elpa_impl_generalized_transform_template.F90 --- elpa-2016.05.001/src/elpa_impl_generalized_transform_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_impl_generalized_transform_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,212 @@ +! using elpa internal Hermitian multiply is faster then scalapack multiply, but we need an extra +! temporary matrix. +! using cannon algorithm should be the fastest. After this is verified, the other options should be removed +! however, we need the extra temporary matrix as well. 
+ + subroutine elpa_transform_generalized_& + &ELPA_IMPL_SUFFIX& + &(self, a, b, is_already_decomposed, error) + use precision + implicit none +#include "general/precision_kinds.F90" + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: a(self%local_nrows, *), b(self%local_nrows, *) +#else + MATH_DATATYPE(kind=rck) :: a(self%local_nrows, self%local_ncols), b(self%local_nrows, self%local_ncols) +#endif + integer :: error + logical :: is_already_decomposed + integer :: sc_desc(SC_DESC_LEN) + integer(kind=ik) :: my_p, my_prow, my_pcol, np_rows, np_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all + integer(kind=MPI_KIND) :: my_pMPI, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: BuffLevelInt, use_cannon + integer(kind=MPI_KIND) :: mpierr + + MATH_DATATYPE(kind=rck) :: tmp(self%local_nrows, self%local_ncols) + + call self%get("mpi_comm_rows",mpi_comm_rows,error) + call self%get("mpi_comm_cols",mpi_comm_cols,error) + call self%get("mpi_comm_parent", mpi_comm_all,error) + + call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_pMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND),my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND),np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND),my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND),np_colsMPI, mpierr) + + my_p = int(my_pMPI, kind=c_int) + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + + call self%timer_start("transform_generalized()") + call self%get("cannon_for_generalized",use_cannon,error) + +#if !defined(WITH_MPI) + if(my_p == 0) then + write(*,*) "Cannons algorithm can only be used with MPI" + write(*,*) "Switching to elpa Hermitian and scalapack" + end if + use_cannon = 0 +#endif + + if (mod(np_cols, np_rows) /= 0) then + if(my_p == 0) then + write(*,*) "To use Cannons 
algorithm, np_cols must be a multiple of np_rows." + write(*,*) "Switching to elpa Hermitian and scalapack" + end if + use_cannon = 0 + endif + + error = self%construct_scalapack_descriptor(sc_desc, .false.) + if(error .NE. ELPA_OK) return + + if (.not. is_already_decomposed) then + ! B = U^T*U, B<-U + call self%elpa_cholesky_& + &ELPA_IMPL_SUFFIX& + &(b, error) + if(error .NE. ELPA_OK) return + ! B <- inv(U) + call self%elpa_invert_trm_& + &ELPA_IMPL_SUFFIX& + &(b, error) + if(error .NE. ELPA_OK) return + end if + + if(use_cannon == 1) then + call self%get("cannon_buffer_size",BuffLevelInt,error) + call self%timer_start("cannons_reduction") + ! BEWARE! even though tmp is output from the routine, it has to be zero on input! + tmp = 0.0_rck +#ifdef WITH_MPI + call cannons_reduction_& + &ELPA_IMPL_SUFFIX& + &(a, b, self%local_nrows, self%local_ncols, & + int(sc_desc,kind=BLAS_KIND), tmp, int(BuffLevelInt,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND)) +#endif + call self%timer_stop("cannons_reduction") + + a(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols) + + else ! do not use cannon algorithm, use elpa hermitian multiply and scalapack instead + ! tmp <- inv(U^T) * A (we have to use temporary variable) + call self%elpa_hermitian_multiply_& + &ELPA_IMPL_SUFFIX& + &('U','F', self%na, b, a, self%local_nrows, self%local_ncols, tmp, & + self%local_nrows, self%local_ncols, error) + if(error .NE. ELPA_OK) return + + ! A <- inv(U)^T * A + a(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols) + + ! A <- inv(U)^T * A * inv(U) + ! For this multiplication we do not have internal function in ELPA, + ! 
so we have to call scalapack + call self%timer_start("scalapack multiply A * inv(U)") +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &trmm("R", "U", "N", "N", int(self%na,kind=BLAS_KIND), int(self%na,kind=BLAS_KIND), & + ONE, b, 1_BLAS_KIND, 1_BLAS_KIND, int(sc_desc,kind=BLAS_KIND), & + a, 1_BLAS_KIND, 1_BLAS_KIND, int(sc_desc,kind=BLAS_KIND)) +#else + call BLAS_CHAR& + &trmm("R", "U", "N", "N", int(self%na,kind=BLAS_KIND), int(self%na,kind=BLAS_KIND), & + ONE, b, int(self%na,kind=BLAS_KIND), a, int(self%na,kind=BLAS_KIND)) +#endif + call self%timer_stop("scalapack multiply A * inv(U)") + endif ! use_cannon + + !write(*, *) my_prow, my_pcol, "A(2,3)", a(2,3) + + call self%timer_stop("transform_generalized()") + end subroutine + + + subroutine elpa_transform_back_generalized_& + &ELPA_IMPL_SUFFIX& + &(self, b, q, error) + implicit none +#include "general/precision_kinds.F90" + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=rck) :: b(self%local_nrows, *), q(self%local_nrows, *) +#else + MATH_DATATYPE(kind=rck) :: b(self%local_nrows, self%local_ncols), q(self%local_nrows, self%local_ncols) +#endif + integer(kind=ik) :: my_p, my_prow, my_pcol, np_rows, np_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all + integer(kind=MPI_KIND) :: mpierr, my_pMPI, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer :: error + integer :: sc_desc(SC_DESC_LEN) + integer :: sc_desc_ev(SC_DESC_LEN) + integer(kind=ik) :: use_cannon + + MATH_DATATYPE(kind=rck) :: tmp(self%local_nrows, self%local_ncols) + + call self%get("mpi_comm_rows",mpi_comm_rows,error) + call self%get("mpi_comm_cols",mpi_comm_cols,error) + call self%get("mpi_comm_parent", mpi_comm_all,error) + + call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_pMPI,mpierr) + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND),my_prowMPI,mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND),np_rowsMPI,mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND),my_pcolMPI,mpierr) + call 
mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND),np_colsMPI,mpierr) + + my_p = int(my_pMPI,kind=c_int) + my_prow = int(my_prowMPI,kind=c_int) + np_rows = int(np_rowsMPI,kind=c_int) + my_pcol = int(my_pcolMPI,kind=c_int) + np_cols = int(np_colsMPI,kind=c_int) + + call self%timer_start("transform_back_generalized()") + call self%get("cannon_for_generalized",use_cannon,error) + +#if !defined(WITH_MPI) + use_cannon = 0 +#endif + + if (mod(np_cols, np_rows) /= 0) then + use_cannon = 0 + endif + + error = self%construct_scalapack_descriptor(sc_desc, .false.) + error = self%construct_scalapack_descriptor(sc_desc_ev, .true.) + if(error .NE. ELPA_OK) return + + if(use_cannon == 1) then + call self%timer_start("cannons_triang_rectangular") +#ifdef WITH_MPI + call cannons_triang_rectangular_& + &ELPA_IMPL_SUFFIX& + &(b, q, self%local_nrows, self%local_ncols, & + int(sc_desc,kind=BLAS_KIND), int(sc_desc_ev,kind=BLAS_KIND), tmp, & + int(mpi_comm_rows,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND) ); +#endif + call self%timer_stop("cannons_triang_rectangular") + + q(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols) + else + call self%timer_start("scalapack multiply inv(U) * Q") +#ifdef WITH_MPI + ! 
Q <- inv(U) * Q + call p& + &BLAS_CHAR& + &trmm("L", "U", "N", "N", int(self%na,kind=BLAS_KIND), int(self%nev,kind=BLAS_KIND), & + ONE, b, 1_BLAS_KIND, 1_BLAS_KIND, int(sc_desc,kind=BLAS_KIND), & + q, 1_BLAS_KIND, 1_BLAS_KIND, int(sc_desc,kind=BLAS_KIND)) +#else + call BLAS_CHAR& + &trmm("L", "U", "N", "N", int(self%na,kind=BLAS_KIND), int(self%nev,kind=BLAS_KIND), & + ONE, b, int(self%na,kind=BLAS_KIND), q, int(self%na,kind=BLAS_KIND)) +#endif + call self%timer_stop("scalapack multiply inv(U) * Q") + endif + call self%timer_stop("transform_back_generalized()") + + end subroutine + diff -Nru elpa-2016.05.001/src/elpa_impl_math_template.F90 elpa-2019.11.001/src/elpa_impl_math_template.F90 --- elpa-2016.05.001/src/elpa_impl_math_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_impl_math_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1383 @@ + + !> \brief elpa_eigenvectors_d: class method to solve the eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). + !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param q On output: Eigenvectors of a + !> Distribution is like in Scalapack. + !> Must be always dimensioned to the full size (corresponding to (na,na)) + !> even if only a part of the eigenvalues is needed. 
+ !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + + subroutine elpa_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, q, error) + class(elpa_impl_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), q(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), q(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + integer :: error2 + integer(kind=c_int) :: solver + logical :: success_l + + + call self%get("solver", solver,error2) + if (error2 .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + endif +#else + error = error2 +#endif + return + endif + if (solver .eq. ELPA_SOLVER_1STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + call self%autotune_timer%stop("accumulator") + + else if (solver .eq. ELPA_SOLVER_2STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + call self%autotune_timer%stop("accumulator") + + else + print *,"unknown solver" + stop + endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in eigenvectors() and you did not check for errors!" 
+ endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_eigenvectors_d(elpa_t handle, double *a, double *ev, double *q, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_eigenvectors_f(elpa_t handle, float *a, float *ev, float *q, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_eigenvectors_dc(elpa_t handle, double complex *a, double *ev, double complex *q, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_eigenvectors_fc(elpa_t handle, float complex *a, float *ev, float complex *q, int *error); +#endif +#endif + subroutine elpa_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, ev_p, q_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_eigenvectors_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_eigenvectors_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_eigenvectors_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_eigenvectors_fc") +#endif +#endif + type(c_ptr), intent(in), value :: handle, a_p, ev_p, q_p +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :), q(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + call c_f_pointer(ev_p, ev, [self%na]) + call c_f_pointer(q_p, q, [self%local_nrows, self%local_ncols]) + + call elpa_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, q, error) + end subroutine + +#ifdef REALCASE + !> \brief elpa_skew_eigenvectors_d: class method to solve the real valued skew-symmetric eigenvalue problem + !> + !> The dimensions of the 
matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). + !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param q On output: Eigenvectors of a + !> Distribution is like in Scalapack. + !> Must be always dimensioned to the full size (corresponding to (na,na)) + !> even if only a part of the eigenvalues is needed. + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + + subroutine elpa_skew_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, q, error) + class(elpa_impl_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), q(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols) + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: q(self%local_nrows, 2*self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + integer :: error2 + integer(kind=c_int) :: solver + logical :: success_l + + + call self%get("solver", solver,error2) + call self%set("is_skewsymmetric",1) + if (error2 .ne. ELPA_OK) then + print *,"Problem setting option. Aborting..." +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + endif +#else + error = error2 +#endif + return + endif + if (solver .eq. 
ELPA_SOLVER_1STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + call self%autotune_timer%stop("accumulator") + + else if (solver .eq. ELPA_SOLVER_2STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + call self%autotune_timer%stop("accumulator") + + else + print *,"unknown solver" + stop + endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in skew_eigenvectors() and you did not check for errors!" + endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_skew_eigenvectors_d(elpa_t handle, double *a, double *ev, double *q, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_skew_eigenvectors_f(elpa_t handle, float *a, float *ev, float *q, int *error); +#endif +#endif + subroutine elpa_skew_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, ev_p, q_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_skew_eigenvectors_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_skew_eigenvectors_f") +#endif +#endif + + type(c_ptr), intent(in), value :: handle, a_p, ev_p, q_p +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :), q(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, 
self%local_ncols]) + call c_f_pointer(ev_p, ev, [self%na]) + call c_f_pointer(q_p, q, [self%local_nrows, self%local_ncols]) + + call elpa_skew_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, q, error) + end subroutine +#endif /* REALCASE */ + + !> \brief elpa_eigenvalues_d: class method to solve the eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). + !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, error) + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + integer :: error2 + integer(kind=c_int) :: solver + logical :: success_l + + + call self%get("solver", solver,error2) + if (error2 .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + endif +#else + error = error2 +#endif + return + endif + + if (solver .eq. 
ELPA_SOLVER_1STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + call self%autotune_timer%stop("accumulator") + + else if (solver .eq. ELPA_SOLVER_2STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + call self%autotune_timer%stop("accumulator") + + else + print *,"unknown solver" + stop + endif +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in eigenvalues() and you did not check for errors!" + endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_eigenvalues_d(elpa_t handle, double *a, double *ev, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_eigenvalues_f(elpa_t handle, float *a, float *ev, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_eigenvalues_dc(elpa_t handle, double complex *a, double *ev, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_eigenvalues_fc(elpa_t handle, float complex *a, float *ev, int *error); +#endif +#endif + subroutine elpa_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, ev_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_eigenvalues_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_eigenvalues_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_eigenvalues_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_eigenvalues_fc") +#endif +#endif + + type(c_ptr), intent(in), value :: 
handle, a_p, ev_p + integer(kind=c_int), intent(in) :: error + + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + call c_f_pointer(ev_p, ev, [self%na]) + + call elpa_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, error) + end subroutine + +#ifdef REALCASE + !> \brief elpa_skew_eigenvalues_d: class method to solve the real valued skew-symmetric eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). 
+ !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_skew_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, error) + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + integer :: error2 + integer(kind=c_int) :: solver + logical :: success_l + + call self%get("solver", solver,error2) + call self%set("is_skewsymmetric",1) + if (error2 .ne. ELPA_OK) then + print *,"Problem getting option. Aborting..." +#ifdef USE_FORTRAN2008 + if (present(error)) then + error = error2 + endif +#else + error = error2 +#endif + return + endif + + if (solver .eq. ELPA_SOLVER_1STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + call self%autotune_timer%stop("accumulator") + + else if (solver .eq. ELPA_SOLVER_2STAGE) then + call self%autotune_timer%start("accumulator") +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + call self%autotune_timer%stop("accumulator") + + else + print *,"unknown solver" + stop + endif +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in skew_eigenvalues() and you did not check for errors!" 
+ endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_skew_eigenvalues_d(elpa_t handle, double *a, double *ev, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_skew_eigenvalues_f(elpa_t handle, float *a, float *ev, int *error); +#endif +#endif + subroutine elpa_skew_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, ev_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_skew_eigenvalues_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_skew_eigenvalues_f") +#endif +#endif + type(c_ptr), intent(in), value :: handle, a_p, ev_p + integer(kind=c_int), intent(in) :: error + + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + call c_f_pointer(ev_p, ev, [self%na]) + + call elpa_skew_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, ev, error) + end subroutine +#endif /* REALCASE */ + + !> \brief elpa_generalized_eigenvectors_d: class method to solve the eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). 
+ !> + !> \param b Distributed matrix, part of the generalized eigenvector problem, or the + !> product of a previous call to this function (see is_already_decomposed). + !> Distribution is like in Scalapack. + !> If is_already_decomposed is false, on exit replaced by the decomposition + !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param q On output: Eigenvectors of a + !> Distribution is like in Scalapack. + !> Must be always dimensioned to the full size (corresponding to (na,na)) + !> even if only a part of the eigenvalues is needed. + !> + !> \param is_already_decomposed has to be set to .false. for the first call with a given b and .true. for + !> each subsequent call with the same b, since b then already contains + !> decomposition and thus the decomposing step is skipped + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_generalized_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, b, ev, q, is_already_decomposed, error) + use elpa2_impl + use elpa1_impl + use elpa_utilities, only : error_unit + use iso_c_binding + class(elpa_impl_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), b(self%local_nrows, *), q(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), b(self%local_nrows, self%local_ncols), & + q(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + logical :: is_already_decomposed + + integer, optional :: error + integer :: error_l + integer(kind=c_int) :: solver + logical :: success_l + +#if defined(INCLUDE_ROUTINES) + call self%elpa_transform_generalized_& + &ELPA_IMPL_SUFFIX& + & (a, b, is_already_decomposed, error_l) +#endif + if (present(error)) then + error = error_l + else if (error_l .ne. 
ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error in transform_generalized() and you did not check for errors!" + endif + + call self%get("solver", solver,error_l) + if (solver .eq. ELPA_SOLVER_1STAGE) then +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + else if (solver .eq. ELPA_SOLVER_2STAGE) then +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev, q) +#endif + else + print *,"unknown solver" + stop + endif + + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in solve() and you did not check for errors!" + endif + +#if defined(INCLUDE_ROUTINES) + call self%elpa_transform_back_generalized_& + &ELPA_IMPL_SUFFIX& + & (b, q, error_l) +#endif + if (present(error)) then + error = error_l + else if (error_l .ne. ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error in transform_back_generalized() and you did not check for errors!" 
+ endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_generalized_eigenvectors_d(elpa_t handle, double *a, double *b, double *ev, double *q, + !c> int is_already_decomposed, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_generalized_eigenvectors_f(elpa_t handle, float *a, float *b, float *ev, float *q, + !c> int is_already_decomposed, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_generalized_eigenvectors_dc(elpa_t handle, double complex *a, double complex *b, double *ev, double complex *q, + !c> int is_already_decomposed, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_generalized_eigenvectors_fc(elpa_t handle, float complex *a, float complex *b, float *ev, float complex *q, + !c> int is_already_decomposed, int *error); +#endif +#endif + subroutine elpa_generalized_eigenvectors_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, b_p, ev_p, q_p, is_already_decomposed, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_generalized_eigenvectors_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_generalized_eigenvectors_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_generalized_eigenvectors_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_generalized_eigenvectors_fc") +#endif +#endif + type(c_ptr), intent(in), value :: handle, a_p, b_p, ev_p, q_p + integer(kind=c_int), intent(in), value :: is_already_decomposed +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :), b(:, :), q(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + logical :: is_already_decomposed_fortran + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + call 
c_f_pointer(b_p, b, [self%local_nrows, self%local_ncols]) + call c_f_pointer(ev_p, ev, [self%na]) + call c_f_pointer(q_p, q, [self%local_nrows, self%local_ncols]) + if(is_already_decomposed .eq. 0) then + is_already_decomposed_fortran = .false. + else + is_already_decomposed_fortran = .true. + end if + + call elpa_generalized_eigenvectors_& + &ELPA_IMPL_SUFFIX& + & (self, a, b, ev, q, is_already_decomposed_fortran, error) + end subroutine + + + + !> \brief elpa_generalized_eigenvalues_d: class method to solve the eigenvalue problem + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cyclic distribution + !> blocksize, the number of eigenvectors + !> to be computed and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). + !> + !> \param b Distributed matrix, part of the generalized eigenvector problem, or the + !> product of a previous call to this function (see is_already_decomposed). + !> Distribution is like in Scalapack. + !> If is_already_decomposed is false, on exit replaced by the decomposition + !> + !> \param ev On output: eigenvalues of a, every processor gets the complete set + !> + !> \param is_already_decomposed has to be set to .false. for the first call with a given b and .true. 
for + !> each subsequent call with the same b, since b then already contains + !> decomposition and thus the decomposing step is skipped + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_generalized_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, b, ev, is_already_decomposed, error) + use elpa2_impl + use elpa1_impl + use elpa_utilities, only : error_unit + use iso_c_binding + class(elpa_impl_t) :: self + +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, *), b(self%local_nrows, *) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows, self%local_ncols), b(self%local_nrows, self%local_ncols) +#endif + real(kind=C_REAL_DATATYPE) :: ev(self%na) + logical :: is_already_decomposed + + integer, optional :: error + integer :: error_l + integer(kind=c_int) :: solver + logical :: success_l + +#if defined(INCLUDE_ROUTINES) + call self%elpa_transform_generalized_& + &ELPA_IMPL_SUFFIX& + & (a, b, is_already_decomposed, error_l) +#endif + if (present(error)) then + error = error_l + else if (error_l .ne. ELPA_OK) then + write(error_unit,'(a)') "ELPA: Error in transform_generalized() and you did not check for errors!" + endif + + call self%get("solver", solver,error_l) + if (solver .eq. ELPA_SOLVER_1STAGE) then +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_1stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + else if (solver .eq. ELPA_SOLVER_2STAGE) then +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_evp_& + &MATH_DATATYPE& + &_2stage_& + &PRECISION& + &_impl(self, a, ev) +#endif + else + print *,"unknown solver" + stop + endif + + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in solve() and you did not check for errors!" 
+ endif + + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_generalized_eigenvalues_d(elpa_t handle, double *a, double *b, double *ev, + !c> int is_already_decomposed, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_generalized_eigenvalues_f(elpa_t handle, float *a, float *b, float *ev, + !c> int is_already_decomposed, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_generalized_eigenvalues_dc(elpa_t handle, double complex *a, double complex *b, double *ev, + !c> int is_already_decomposed, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_generalized_eigenvalues_fc(elpa_t handle, float complex *a, float complex *b, float *ev, + !c> int is_already_decomposed, int *error); +#endif +#endif + subroutine elpa_generalized_eigenvalues_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, b_p, ev_p, is_already_decomposed, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_generalized_eigenvalues_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_generalized_eigenvalues_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_generalized_eigenvalues_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_generalized_eigenvalues_fc") +#endif +#endif + type(c_ptr), intent(in), value :: handle, a_p, b_p, ev_p + integer(kind=c_int), intent(in), value :: is_already_decomposed +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :), b(:, :) + real(kind=C_REAL_DATATYPE), pointer :: ev(:) + logical :: is_already_decomposed_fortran + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + call c_f_pointer(b_p, b, [self%local_nrows, self%local_ncols]) + call c_f_pointer(ev_p, 
ev, [self%na]) + if(is_already_decomposed .eq. 0) then + is_already_decomposed_fortran = .false. + else + is_already_decomposed_fortran = .true. + end if + + call elpa_generalized_eigenvalues_& + &ELPA_IMPL_SUFFIX& + & (self, a, b, ev, is_already_decomposed_fortran, error) + end subroutine + + + !> \brief elpa_hermitian_multiply_d: class method to perform C : = A**T * B + !> where A is a square matrix (self%na,self%na) which is optionally upper or lower triangular + !> B is a (self%na,ncb) matrix + !> C is a (self%na,ncb) matrix where optionally only the upper or lower + !> triangle may be computed + !> + !> the MPI commicators and the block-cyclic distribution block size are already known to the type. + !> Thus the class method "setup" must be called BEFORE this method is used + !> + !> \details + !> + !> \param self class(elpa_t), the ELPA object + !> \param uplo_a 'U' if A is upper triangular + !> 'L' if A is lower triangular + !> anything else if A is a full matrix + !> Please note: This pertains to the original A (as set in the calling program) + !> whereas the transpose of A is used for calculations + !> If uplo_a is 'U' or 'L', the other triangle is not used at all, + !> i.e. it may contain arbitrary numbers + !> \param uplo_c 'U' if only the upper diagonal part of C is needed + !> 'L' if only the upper diagonal part of C is needed + !> anything else if the full matrix C is needed + !> Please note: Even when uplo_c is 'U' or 'L', the other triangle may be + !> written to a certain extent, i.e. one shouldn't rely on the content there! 
+ !> \param ncb Number of columns of global matrices B and C + !> \param a matrix a + !> \param local_nrows number of rows of local (sub) matrix a, set with class method set("local_nrows",value) + !> \param local_ncols number of columns of local (sub) matrix a, set with class method set("local_ncols",value) + !> \param b matrix b + !> \param nrows_b number of rows of local (sub) matrix b + !> \param ncols_b number of columns of local (sub) matrix b + !> \param c matrix c + !> \param nrows_c number of rows of local (sub) matrix c + !> \param ncols_c number of columns of local (sub) matrix c + !> \param error optional argument, error code which can be queried with elpa_strerr + subroutine elpa_hermitian_multiply_& + &ELPA_IMPL_SUFFIX& + & (self, uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, & + c, nrows_c, ncols_c, error) + class(elpa_impl_t) :: self + character*1 :: uplo_a, uplo_c + integer(kind=c_int), intent(in) :: nrows_b, ncols_b, nrows_c, ncols_c, ncb +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*), b(nrows_b,*), c(nrows_c,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols), b(nrows_b,ncols_b), c(nrows_c,ncols_c) +#endif +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + logical :: success_l + +#if defined(INCLUDE_ROUTINES) +#ifdef REALCASE + success_l = elpa_mult_at_b_& +#endif +#ifdef COMPLEXCASE + success_l = elpa_mult_ah_b_& +#endif + &MATH_DATATYPE& + &_& + &PRECISION& + &_impl(self, uplo_a, uplo_c, ncb, a, b, nrows_b, ncols_b, & + c, nrows_c, ncols_c) +#endif +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in hermitian_multiply() and you did not check for errors!" 
+ endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_hermitian_multiply_d(elpa_t handle, char uplo_a, char uplo_c, int ncb, double *a, double *b, int nrows_b, int ncols_b, double *c, int nrows_c, int ncols_c, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_hermitian_multiply_df(elpa_t handle, char uplo_a, char uplo_c, int ncb, float *a, float *b, int nrows_b, int ncols_b, float *c, int nrows_c, int ncols_c, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_hermitian_multiply_dc(elpa_t handle, char uplo_a, char uplo_c, int ncb, double complex *a, double complex *b, int nrows_b, int ncols_b, double complex *c, int nrows_c, int ncols_c, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_hermitian_multiply_fc(elpa_t handle, char uplo_a, char uplo_c, int ncb, float complex *a, float complex *b, int nrows_b, int ncols_b, float complex *c, int nrows_c, int ncols_c, int *error); +#endif +#endif + subroutine elpa_hermitian_multiply_& + &ELPA_IMPL_SUFFIX& + &_c(handle, uplo_a, uplo_c, ncb, a_p, b, nrows_b, & + ncols_b, c, nrows_c, ncols_c, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_hermitian_multiply_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_hermitian_multiply_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_hermitian_multiply_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_hermitian_multiply_fc") +#endif +#endif + + type(c_ptr), intent(in), value :: handle, a_p + character(1,C_CHAR), value :: uplo_a, uplo_c + integer(kind=c_int), value :: ncb, nrows_b, ncols_b, nrows_c, ncols_c +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + 
MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :) +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: b(nrows_b,*), c(nrows_c,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: b(nrows_b,ncols_b), c(nrows_c,ncols_c) +#endif + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + + call elpa_hermitian_multiply_& + &ELPA_IMPL_SUFFIX& + & (self, uplo_a, uplo_c, ncb, a, b, nrows_b, & + ncols_b, c, nrows_c, ncols_c, error) + end subroutine + + + !> \brief elpa_choleksy_d: class method to do a cholesky factorization + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). 
+ !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_cholesky_& + &ELPA_IMPL_SUFFIX& + & (self, a, error) + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols) +#endif +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + logical :: success_l + +#if defined(INCLUDE_ROUTINES) + success_l = elpa_cholesky_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_impl (self, a) +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in cholesky() and you did not check for errors!" + endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_cholesky_d(elpa_t handle, double *a, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_cholesky_f(elpa_t handle, float *a, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_cholesky_dc(elpa_t handle, double complex *a, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_cholesky_fc(elpa_t handle, float complex *a, int *error); +#endif +#endif + subroutine elpa_choleksy_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_cholesky_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_cholesky_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_cholesky_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_cholesky_fc") +#endif +#endif + + type(c_ptr), intent(in), value :: handle, a_p +#ifdef USE_FORTRAN2008 + 
integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + + call elpa_cholesky_& + &ELPA_IMPL_SUFFIX& + & (self, a, error) + end subroutine + + + !> \brief elpa_invert_trm_d: class method to invert a triangular + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" + !> + !> Parameters + !> + !> \param a Distributed matrix for which eigenvalues are to be computed. + !> Distribution is like in Scalapack. + !> The full matrix must be set (not only one half like in scalapack). + !> Destroyed on exit (upper and lower half). + !> + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_invert_trm_& + &ELPA_IMPL_SUFFIX& + & (self, a, error) + class(elpa_impl_t) :: self +#ifdef USE_ASSUMED_SIZE + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,*) +#else + MATH_DATATYPE(kind=C_DATATYPE_KIND) :: a(self%local_nrows,self%local_ncols) +#endif +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + logical :: success_l + +#if defined(INCLUDE_ROUTINES) + success_l = elpa_invert_trm_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_impl (self, a) +#endif + +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in invert_trm() and you did not check for errors!" 
+ endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + + + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + !c> void elpa_invert_trm_d(elpa_t handle, double *a, int *error); +#endif +#ifdef SINGLE_PRECISION_REAL + !c> void elpa_invert_trm_f(elpa_t handle, float *a, int *error); +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void elpa_invert_trm_dc(elpa_t handle, double complex *a, int *error); +#endif +#ifdef SINGLE_PRECISION_COMPLEX + !c> void elpa_invert_trm_fc(elpa_t handle, float complex *a, int *error); +#endif +#endif + subroutine elpa_invert_trm_& + &ELPA_IMPL_SUFFIX& + &_c(handle, a_p, error) & +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + bind(C, name="elpa_invert_trm_d") +#endif +#ifdef SINGLE_PRECISION_REAL + bind(C, name="elpa_invert_trm_f") +#endif +#endif +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + bind(C, name="elpa_invert_trm_dc") +#endif +#ifdef SINGLE_PRECISION_COMPLEX + bind(C, name="elpa_invert_trm_fc") +#endif +#endif + + type(c_ptr), intent(in), value :: handle, a_p +#ifdef USE_FORTRAN2008 + integer(kind=c_int), optional, intent(in) :: error +#else + integer(kind=c_int), intent(in) :: error +#endif + MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: a(:, :) + type(elpa_impl_t), pointer :: self + + call c_f_pointer(handle, self) + call c_f_pointer(a_p, a, [self%local_nrows, self%local_ncols]) + + call elpa_invert_trm_& + &ELPA_IMPL_SUFFIX& + & (self, a, error) + end subroutine + + + !> \brief elpa_solve_tridiagonal_d: class method to solve the eigenvalue problem for a tridiagonal matrix a + !> + !> The dimensions of the matrix a (locally ditributed and global), the block-cylic-distribution + !> block size, and the MPI communicators are already known to the object and MUST be set BEFORE + !> with the class method "setup" + !> + !> It is possible to change the behaviour of the method by setting tunable parameters with the + !> class method "set" 
+ !> + !> Parameters + !> + !> \param d array d on input diagonal elements of tridiagonal matrix, on + !> output the eigenvalues in ascending order + !> \param e array e on input subdiagonal elements of matrix, on exit destroyed + !> \param q matrix on exit : contains the eigenvectors + !> \param error integer, optional: returns an error code, which can be queried with elpa_strerr + subroutine elpa_solve_tridiagonal_& + &ELPA_IMPL_SUFFIX& + & (self, d, e, q, error) + class(elpa_impl_t) :: self + real(kind=C_REAL_DATATYPE) :: d(self%na), e(self%na) +#ifdef USE_ASSUMED_SIZE + real(kind=C_REAL_DATATYPE) :: q(self%local_nrows,*) +#else + real(kind=C_REAL_DATATYPE) :: q(self%local_nrows,self%local_ncols) +#endif +#ifdef USE_FORTRAN2008 + integer, optional :: error +#else + integer :: error +#endif + logical :: success_l + +#if defined(INCLUDE_ROUTINES) + success_l = elpa_solve_tridi_& + &PRECISION& + &_impl(self, d, e, q) +#else + print *,"ELPA is not compiled with single-precision support" + stop +#endif +#ifdef USE_FORTRAN2008 + if (present(error)) then + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif + else if (.not. success_l) then + write(error_unit,'(a)') "ELPA: Error in solve_tridiagonal() and you did not check for errors!" + endif +#else + if (success_l) then + error = ELPA_OK + else + error = ELPA_ERROR + endif +#endif + end subroutine + diff -Nru elpa-2016.05.001/src/elpa_index.c elpa-2019.11.001/src/elpa_index.c --- elpa-2016.05.001/src/elpa_index.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_index.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1407 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// Authors: L. Huedepohl and A. 
Marek, MPCDF +#include +#include +#include +#include +#include "elpa_index.h" + +#include "config.h" + +#ifdef WITH_OPENMP +#include +#endif + +int max_threads_glob; +int set_max_threads_glob=0; + +static int enumerate_identity(elpa_index_t index, int i); +static int cardinality_bool(elpa_index_t index); +static int valid_bool(elpa_index_t index, int n, int new_value); + +static int number_of_solvers(elpa_index_t index); +static int solver_enumerate(elpa_index_t index, int i); +static int solver_is_valid(elpa_index_t index, int n, int new_value); +static const char* elpa_solver_name(int solver); + +static int number_of_real_kernels(elpa_index_t index); +static int real_kernel_enumerate(elpa_index_t index, int i); +static int real_kernel_is_valid(elpa_index_t index, int n, int new_value); +static const char *real_kernel_name(int kernel); + +static int number_of_complex_kernels(elpa_index_t index); +static int complex_kernel_enumerate(elpa_index_t index, int i); +static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value); +static const char *complex_kernel_name(int kernel); + +static int band_to_full_cardinality(elpa_index_t index); +static int band_to_full_enumerate(elpa_index_t index, int i); +static int band_to_full_is_valid(elpa_index_t index, int n, int new_value); + +static int stripewidth_real_cardinality(elpa_index_t index); +static int stripewidth_real_enumerate(elpa_index_t index, int i); +static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value); + +static int stripewidth_complex_cardinality(elpa_index_t index); +static int stripewidth_complex_enumerate(elpa_index_t index, int i); +static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value); + +static int omp_threads_cardinality(elpa_index_t index); +static int omp_threads_enumerate(elpa_index_t index, int i); +static int omp_threads_is_valid(elpa_index_t index, int n, int new_value); + +static int max_stored_rows_cardinality(elpa_index_t index); 
+static int max_stored_rows_enumerate(elpa_index_t index, int i); +static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value); + +static int min_tile_size_cardinality(elpa_index_t index); +static int min_tile_size_enumerate(elpa_index_t index, int i); +static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value); + +static int valid_with_gpu(elpa_index_t index, int n, int new_value); +static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value); +static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value); + +static int intermediate_bandwidth_cardinality(elpa_index_t index); +static int intermediate_bandwidth_enumerate(elpa_index_t index, int i); +static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value); + +static int cannon_buffer_size_cardinality(elpa_index_t index); +static int cannon_buffer_size_enumerate(elpa_index_t index, int i); +static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value); + +static int na_is_valid(elpa_index_t index, int n, int new_value); +static int nev_is_valid(elpa_index_t index, int n, int new_value); +static int bw_is_valid(elpa_index_t index, int n, int new_value); +static int output_build_config_is_valid(elpa_index_t index, int n, int new_value); +static int gpu_is_valid(elpa_index_t index, int n, int new_value); +static int skewsymmetric_is_valid(elpa_index_t index, int n, int new_value); + +static int is_positive(elpa_index_t index, int n, int new_value); + +static int elpa_double_string_to_value(char *name, char *string, double *value); +static int elpa_double_value_to_string(char *name, double value, const char **string); + +#define BASE_ENTRY(option_name, option_description, once_value, readonly_value, print_flag_value) \ + .base = { \ + .name = option_name, \ + .description = option_description, \ + .once = once_value, \ + .readonly = readonly_value, \ + .env_default = "ELPA_DEFAULT_" option_name, \ + .env_force = 
"ELPA_FORCE_" option_name, \ + .print_flag = print_flag_value, \ + } + +#define INT_PARAMETER_ENTRY(option_name, option_description, valid_func, print_flag) \ + { \ + BASE_ENTRY(option_name, option_description, 1, 0, print_flag), \ + .valid = valid_func, \ + } + +#define BOOL_ENTRY(option_name, option_description, default, tune_level, tune_domain, print_flag) \ + { \ + BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \ + .default_value = default, \ + .autotune_level = tune_level, \ + .autotune_domain = tune_domain, \ + .cardinality = cardinality_bool, \ + .enumerate = enumerate_identity, \ + .valid = valid_bool, \ + } + +#define INT_ENTRY(option_name, option_description, default, tune_level, tune_domain, card_func, enumerate_func, valid_func, to_string_func, print_flag) \ + { \ + BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \ + .default_value = default, \ + .autotune_level = tune_level, \ + .autotune_domain = tune_domain, \ + .cardinality = card_func, \ + .enumerate = enumerate_func, \ + .valid = valid_func, \ + .to_string = to_string_func, \ + } + +#define INT_ANY_ENTRY(option_name, option_description, print_flag) \ + { \ + BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \ + } + +/* The order here is important! 
Tunable options that are dependent on other + * tunable options must appear later in the list than their prerequisites */ +static const elpa_index_int_entry_t int_entries[] = { + INT_PARAMETER_ENTRY("na", "Global matrix has size (na * na)", na_is_valid, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("nev", "Number of eigenvectors to be computed, 0 <= nev <= na", nev_is_valid, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("nblk", "Block size of scalapack block-cyclic distribution", is_positive, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("local_nrows", "Number of matrix rows stored on this process", NULL, PRINT_NO), + INT_PARAMETER_ENTRY("local_ncols", "Number of matrix columns stored on this process", NULL, PRINT_NO), + INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL, PRINT_NO), + INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL, PRINT_NO), + INT_PARAMETER_ENTRY("process_id", "Process rank", NULL, PRINT_NO), + INT_PARAMETER_ENTRY("num_process_rows", "Number of process row number in the 2D domain decomposition", NULL, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("num_process_cols", "Number of process column number in the 2D domain decomposition", NULL, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("num_processes", "Total number of processes", NULL, PRINT_STRUCTURE), + INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid, PRINT_YES), + INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication", PRINT_NO), + INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication", PRINT_NO), + INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator", PRINT_NO), + INT_ANY_ENTRY("blacs_context", "BLACS context", PRINT_NO), +#ifdef STORE_BUILD_CONFIG + INT_ENTRY("output_build_config", "Output the build config", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, 
enumerate_identity, output_build_config_is_valid, NULL, PRINT_NO), +#endif + INT_ENTRY("solver", "Solver to use", ELPA_SOLVER_1STAGE, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY, \ + number_of_solvers, solver_enumerate, solver_is_valid, elpa_solver_name, PRINT_YES), + INT_ENTRY("gpu", "Use GPU acceleration", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, gpu_is_valid, NULL, PRINT_YES), + INT_ENTRY("is_skewsymmetric", "Matrix is skewsymmetic", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, + cardinality_bool, enumerate_identity, skewsymmetric_is_valid, NULL, PRINT_YES), + //default of gpu ussage for individual phases is 1. However, it is only evaluated, if GPU is used at all, which first has to be determined + //by the parameter gpu and presence of the device + INT_ENTRY("gpu_tridiag", "Use GPU acceleration for ELPA1 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES), + INT_ENTRY("gpu_solve_tridi", "Use GPU acceleration for ELPA solve tridi", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, valid_with_gpu, NULL, PRINT_YES), + INT_ENTRY("gpu_trans_ev", "Use GPU acceleration for ELPA1 trans ev", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES), + INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), + // the routine has not been ported to GPU yet +// INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ +// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), + // the GPU implementation of this routine (together with the kernel) has been abandoned +// 
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ +// cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), + INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES), + INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \ + number_of_real_kernels, real_kernel_enumerate, real_kernel_is_valid, real_kernel_name, PRINT_YES), + INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \ + number_of_complex_kernels, complex_kernel_enumerate, complex_kernel_is_valid, complex_kernel_name, PRINT_YES), + + INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + min_tile_size_cardinality, min_tile_size_enumerate, min_tile_size_is_valid, NULL, PRINT_YES), + INT_ENTRY("intermediate_bandwidth", "Specifies the intermediate bandwidth in ELPA2 full->banded step. Must be a multiple of nblk", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + intermediate_bandwidth_cardinality, intermediate_bandwidth_enumerate, intermediate_bandwidth_is_valid, NULL, PRINT_YES), + + INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL, PRINT_YES), + INT_ENTRY("stripewidth_real", "Stripewidth_real, default 48. 
Must be a multiple of 4", 48, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_REAL, \ + stripewidth_real_cardinality, stripewidth_real_enumerate, stripewidth_real_is_valid, NULL, PRINT_YES), + INT_ENTRY("stripewidth_complex", "Stripewidth_complex, default 96. Must be a multiple of 8", 96, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \ + stripewidth_complex_cardinality, stripewidth_complex_enumerate, stripewidth_complex_is_valid, NULL, PRINT_YES), + + INT_ENTRY("max_stored_rows", "Maximum number of stored rows used in ELPA 1 backtransformation, default 63", 63, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + max_stored_rows_cardinality, max_stored_rows_enumerate, max_stored_rows_is_valid, NULL, PRINT_YES), +#ifdef WITH_OPENMP + INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY, \ + omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES), +#else + INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES), +#endif + INT_ENTRY("cannon_buffer_size", "Increasing the buffer size might make it faster, but costs memory", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY, \ + cannon_buffer_size_cardinality, cannon_buffer_size_enumerate, cannon_buffer_size_is_valid, NULL, PRINT_YES), + //BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL), + BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_REAL, PRINT_YES), + BOOL_ENTRY("timings", "Enable time measurement", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), + BOOL_ENTRY("debug", "Emit verbose debugging messages", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), + BOOL_ENTRY("print_flops", "Print 
FLOP rates on task 0", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), + BOOL_ENTRY("measure_performance", "Also measure with flops (via papi) with the timings", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), + BOOL_ENTRY("check_pd", "Check eigenvalues to be positive", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), + BOOL_ENTRY("cannon_for_generalized", "Whether to use Cannons algorithm for the generalized EVP", 1, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES), +}; + +#define READONLY_DOUBLE_ENTRY(option_name, option_description) \ + { \ + BASE_ENTRY(option_name, option_description, 0, 1, 0) \ + } + +static const elpa_index_double_entry_t double_entries[] = { + /* Empty for now */ +}; + +void elpa_index_free(elpa_index_t index) { +#define FREE_OPTION(TYPE, ...) \ + free(index->TYPE##_options.values); \ + free(index->TYPE##_options.is_set); \ + free(index->TYPE##_options.notified); + + FOR_ALL_TYPES(FREE_OPTION); + + free(index); +} + +static int compar(const void *a, const void *b) { + return strcmp(((elpa_index_int_entry_t *) a)->base.name, + ((elpa_index_int_entry_t *) b)->base.name); +} + +#define IMPLEMENT_FIND_ENTRY(TYPE, ...) \ + static int find_##TYPE##_entry(char *name) { \ + elpa_index_##TYPE##_entry_t *entry; \ + elpa_index_##TYPE##_entry_t key = { .base = {.name = name} } ; \ + size_t nmembers = nelements(TYPE##_entries); \ + entry = lfind((const void*) &key, (const void *) TYPE##_entries, &nmembers, sizeof(elpa_index_##TYPE##_entry_t), compar); \ + if (entry) { \ + return (entry - &TYPE##_entries[0]); \ + } else { \ + return -1; \ + } \ + } +FOR_ALL_TYPES(IMPLEMENT_FIND_ENTRY) + + +#define IMPLEMENT_GETENV(TYPE, PRINTF_SPEC, ...) 
\ + static int getenv_##TYPE(elpa_index_t index, const char *env_variable, enum NOTIFY_FLAGS notify_flag, int n, TYPE *value, const char *error_string) { \ + int err; \ + char *env_value = getenv(env_variable); \ + if (env_value) { \ + err = elpa_##TYPE##_string_to_value(TYPE##_entries[n].base.name, env_value, value); \ + if (err != ELPA_OK) { \ + fprintf(stderr, "ELPA: Error interpreting environment variable %s with value '%s': %s\n", \ + TYPE##_entries[n].base.name, env_value, elpa_strerr(err)); \ + } else {\ + const char *value_string = NULL; \ + if (elpa_##TYPE##_value_to_string(TYPE##_entries[n].base.name, *value, &value_string) == ELPA_OK) { \ + if (!(index->TYPE##_options.notified[n] & notify_flag)) { \ + if (elpa_index_is_printing_mpi_rank(index)) { \ + fprintf(stderr, "ELPA: %s '%s' is set to %s due to environment variable %s\n", \ + error_string, TYPE##_entries[n].base.name, value_string, env_variable); \ + } \ + index->TYPE##_options.notified[n] |= notify_flag; \ + } \ + } else { \ + if (elpa_index_is_printing_mpi_rank(index)) { \ + fprintf(stderr, "ELPA: %s '%s' is set to '" PRINTF_SPEC "' due to environment variable %s\n", \ + error_string, TYPE##_entries[n].base.name, *value, env_variable);\ + } \ + } \ + return 1; \ + } \ + } \ + return 0; \ + } +FOR_ALL_TYPES(IMPLEMENT_GETENV) + + +#define IMPLEMENT_GET_FUNCTION(TYPE, PRINTF_SPEC, SCANF_SPEC, ERROR_VALUE) \ + TYPE elpa_index_get_##TYPE##_value(elpa_index_t index, char *name, int *error) { \ + TYPE ret; \ + if (sizeof(TYPE##_entries) == 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + int n = find_##TYPE##_entry(name); \ + if (n >= 0) { \ + int from_env = 0; \ + if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \ + from_env = getenv_##TYPE(index, TYPE##_entries[n].base.env_force, NOTIFY_ENV_FORCE, n, &ret, "Option"); \ + } \ + if (!from_env) { \ + ret = index->TYPE##_options.values[n]; \ + } \ + if (error != NULL) { \ + *error = ELPA_OK; \ + } \ + return ret; \ + } else { \ 
+ if (error != NULL) { \ + *error = ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + return ERROR_VALUE; \ + } \ + } +FOR_ALL_TYPES(IMPLEMENT_GET_FUNCTION) + + +#define IMPLEMENT_LOC_FUNCTION(TYPE, ...) \ + TYPE* elpa_index_get_##TYPE##_loc(elpa_index_t index, char *name) { \ + if (sizeof(TYPE##_entries) == 0) { \ + return NULL; \ + } \ + int n = find_##TYPE##_entry(name); \ + if (n >= 0) { \ + return &index->TYPE##_options.values[n]; \ + } else { \ + return NULL; \ + } \ + } +FOR_ALL_TYPES(IMPLEMENT_LOC_FUNCTION) + + +#define IMPLEMENT_SET_FUNCTION(TYPE, PRINTF_SPEC, ...) \ + int elpa_index_set_##TYPE##_value(elpa_index_t index, char *name, TYPE value) { \ + if (sizeof(TYPE##_entries) == 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + int n = find_##TYPE##_entry(name); \ + if (n < 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + }; \ + if (TYPE##_entries[n].valid != NULL) { \ + if(!TYPE##_entries[n].valid(index, n, value)) { \ + return ELPA_ERROR_ENTRY_INVALID_VALUE; \ + }; \ + } \ + if (TYPE##_entries[n].base.once & index->TYPE##_options.is_set[n]) { \ + return ELPA_ERROR_ENTRY_ALREADY_SET; \ + } \ + if (TYPE##_entries[n].base.readonly) { \ + return ELPA_ERROR_ENTRY_READONLY; \ + } \ + index->TYPE##_options.values[n] = value; \ + index->TYPE##_options.is_set[n] = 1; \ + return ELPA_OK; \ + } +FOR_ALL_TYPES(IMPLEMENT_SET_FUNCTION) + +#define IMPLEMENT_SET_FROM_LOAD_FUNCTION(TYPE, PRINTF_SPEC, ...) \ + int elpa_index_set_from_load_##TYPE##_value(elpa_index_t index, char *name, TYPE value, int explicit) { \ + if (sizeof(TYPE##_entries) == 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + int n = find_##TYPE##_entry(name); \ + if (n < 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + }; \ + index->TYPE##_options.values[n] = value; \ + if(explicit) \ + index->TYPE##_options.is_set[n] = 1; \ + return ELPA_OK; \ + } +FOR_ALL_TYPES(IMPLEMENT_SET_FROM_LOAD_FUNCTION) + + +#define IMPLEMENT_IS_SET_FUNCTION(TYPE, ...) 
\ + int elpa_index_##TYPE##_value_is_set(elpa_index_t index, char *name) { \ + if (sizeof(TYPE##_entries) == 0) { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + int n = find_##TYPE##_entry(name); \ + if (n >= 0) { \ + if (index->TYPE##_options.is_set[n]) { \ + return 1; \ + } else { \ + return 0; \ + } \ + } else { \ + return ELPA_ERROR_ENTRY_NOT_FOUND; \ + } \ + } +FOR_ALL_TYPES(IMPLEMENT_IS_SET_FUNCTION) + + +int elpa_index_value_is_set(elpa_index_t index, char *name) { + int res = ELPA_ERROR; + +#define RET_IF_SET(TYPE, ...) \ + res = elpa_index_##TYPE##_value_is_set(index, name); \ + if (res >= 0) { \ + return res; \ + } + + FOR_ALL_TYPES(RET_IF_SET) + + fprintf(stderr, "ELPA Error: Could not find entry '%s'\n", name); + return res; +} + +int elpa_index_int_is_valid(elpa_index_t index, char *name, int new_value) { + int n = find_int_entry(name); \ + if (n >= 0) { \ + if (int_entries[n].valid == NULL) { + return ELPA_OK; + } else { + return int_entries[n].valid(index, n, new_value) ? 
ELPA_OK : ELPA_ERROR; + } + } + return ELPA_ERROR_ENTRY_NOT_FOUND; +} + +int elpa_int_value_to_string(char *name, int value, const char **string) { + int n = find_int_entry(name); + if (n < 0) { + return ELPA_ERROR_ENTRY_NOT_FOUND; + } + if (int_entries[n].to_string == NULL) { + return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION; + } + *string = int_entries[n].to_string(value); + return ELPA_OK; +} + + +int elpa_int_value_to_strlen(char *name, int value) { + const char *string = NULL; + elpa_int_value_to_string(name, value, &string); + if (string == NULL) { + return 0; + } else { + return strlen(string); + } +} + + +int elpa_index_int_value_to_strlen(elpa_index_t index, char *name) { + int n = find_int_entry(name); + if (n < 0) { + return 0; + } + return elpa_int_value_to_strlen(name, index->int_options.values[n]); +} + + +int elpa_int_string_to_value(char *name, char *string, int *value) { + int n = find_int_entry(name); + if (n < 0) { + return ELPA_ERROR_ENTRY_NOT_FOUND; + } + + if (int_entries[n].to_string == NULL) { + int val, ret; + ret = sscanf(string, "%d", &val); + if (ret == 1) { + *value = val; + return ELPA_OK; + } else { + return ELPA_ERROR_ENTRY_INVALID_VALUE; + } + } + + for (int i = 0; i < int_entries[n].cardinality(NULL); i++) { + int candidate = int_entries[n].enumerate(NULL, i); + if (strcmp(string, int_entries[n].to_string(candidate)) == 0) { + *value = candidate; + return ELPA_OK; + } + } + return ELPA_ERROR_ENTRY_INVALID_VALUE; +} + +int elpa_double_string_to_value(char *name, char *string, double *value) { + double val; + int ret = sscanf(string, "%lf", &val); + if (ret == 1) { + *value = val; + return ELPA_OK; + } else { + /* \todo: remove */ + fprintf(stderr, "ELPA: DEBUG: Could not parse double value '%s' for option '%s'\n", string, name); + return ELPA_ERROR_ENTRY_INVALID_VALUE; + } +} + +int elpa_double_value_to_string(char *name, double value, const char **string) { + return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION; +} + +int 
elpa_option_cardinality(char *name) { + int n = find_int_entry(name); + if (n < 0 || !int_entries[n].cardinality) { + return ELPA_ERROR_ENTRY_NOT_FOUND; + } + return int_entries[n].cardinality(NULL); +} + +int elpa_option_enumerate(char *name, int i) { + int n = find_int_entry(name); + if (n < 0 || !int_entries[n].enumerate) { + return 0; + } + return int_entries[n].enumerate(NULL, i); +} + + +/* Helper functions for simple int entries */ +static int cardinality_bool(elpa_index_t index) { + return 2; +} + +static int valid_bool(elpa_index_t index, int n, int new_value) { + return (0 <= new_value) && (new_value < 2); +} + +static int enumerate_identity(elpa_index_t index, int i) { + return i; +} + +/* Helper functions for specific options */ + +#define NAME_CASE(name, value, ...) \ + case value: \ + return #name; + +#define VALID_CASE(name, value) \ + case value: \ + return 1; + +#define VALID_CASE_3(name, value, available, other_checks) \ + case value: \ + return available && (other_checks(value)); + +static const char* elpa_solver_name(int solver) { + switch(solver) { + ELPA_FOR_ALL_SOLVERS(NAME_CASE) + default: + return "(Invalid solver)"; + } +} + +static int number_of_solvers(elpa_index_t index) { + return ELPA_NUMBER_OF_SOLVERS; +} + +static int solver_enumerate(elpa_index_t index, int i) { +#define OPTION_RANK(name, value, ...) \ + +(value >= sizeof(array_of_size_value)/sizeof(int) ? 0 : 1) + +#define EMPTY() +#define DEFER1(m) m EMPTY() +#define EVAL(...) __VA_ARGS__ + +#define ENUMERATE_CASE(name, value, ...) 
\ + { const int array_of_size_value[value]; \ + case 0 DEFER1(INNER_ITERATOR)()(OPTION_RANK): \ + return value; } + + switch(i) { +#define INNER_ITERATOR() ELPA_FOR_ALL_SOLVERS + EVAL(ELPA_FOR_ALL_SOLVERS(ENUMERATE_CASE)) +#undef INNER_ITERATOR + default: + return 0; + } +} + + +static int solver_is_valid(elpa_index_t index, int n, int new_value) { + switch(new_value) { + ELPA_FOR_ALL_SOLVERS(VALID_CASE) + default: + return 0; + } +} + +static int number_of_real_kernels(elpa_index_t index) { + return ELPA_2STAGE_NUMBER_OF_REAL_KERNELS; +} + +static int real_kernel_enumerate(elpa_index_t index,int i) { + switch(i) { +#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_REAL_KERNELS + EVAL(ELPA_FOR_ALL_2STAGE_REAL_KERNELS(ENUMERATE_CASE)) +#undef INNER_ITERATOR + default: + return 0; + } +} + +static const char *real_kernel_name(int kernel) { + switch(kernel) { + ELPA_FOR_ALL_2STAGE_REAL_KERNELS(NAME_CASE) + default: + return "(Invalid real kernel)"; + } +} + +#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ + kernel_number == ELPA_2STAGE_REAL_GPU ? 0 : 1 +// currently the GPU kernel is never valid +// previously: kernel_number == ELPA_2STAGE_REAL_GPU ? 
gpu_is_active : 1 + +static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) { + int solver = elpa_index_get_int_value(index, "solver", NULL); + if (solver == ELPA_SOLVER_1STAGE) { + return new_value == ELPA_2STAGE_REAL_DEFAULT; + } + int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL); + switch(new_value) { + ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE) + default: + return 0; + } +} + +static int number_of_complex_kernels(elpa_index_t index) { + return ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS; +} + + +static int complex_kernel_enumerate(elpa_index_t index,int i) { + switch(i) { +#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS + EVAL(ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(ENUMERATE_CASE)) +#undef INNER_ITERATOR + default: + return 0; + } +} + +static const char *complex_kernel_name(int kernel) { + switch(kernel) { + ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(NAME_CASE) + default: + return "(Invalid complex kernel)"; + } +} + +#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \ + kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 0 : 1 +// currenttly the GPU kernel is never valid +// previously: kernel_number == ELPA_2STAGE_COMPLEX_GPU ? 
gpu_is_active : 1 + +static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) { + int solver = elpa_index_get_int_value(index, "solver", NULL); + if (solver == ELPA_SOLVER_1STAGE) { + return new_value == ELPA_2STAGE_COMPLEX_DEFAULT; + } + int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL); + switch(new_value) { + ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE) + default: + return 0; + } +} + +static const char* elpa_autotune_level_name(int level) { + switch(level) { + ELPA_FOR_ALL_AUTOTUNE_LEVELS(NAME_CASE) + default: + return "(Invalid autotune level)"; + } +} + +static const char* elpa_autotune_domain_name(int domain) { + switch(domain) { + ELPA_FOR_ALL_AUTOTUNE_DOMAINS(NAME_CASE) + default: + return "(Invalid autotune domain)"; + } +} + +static int na_is_valid(elpa_index_t index, int n, int new_value) { + return new_value > 0; +} + +static int nev_is_valid(elpa_index_t index, int n, int new_value) { + if (!elpa_index_int_value_is_set(index, "na")) { + return 0; + } + return 0 <= new_value && new_value <= elpa_index_get_int_value(index, "na", NULL); +} + +static int is_positive(elpa_index_t index, int n, int new_value) { + return new_value > 0; +} + +static int bw_is_valid(elpa_index_t index, int n, int new_value) { + int na; + if (elpa_index_int_value_is_set(index, "na") != 1) { + return 0; + } + + na = elpa_index_get_int_value(index, "na", NULL); + return (0 <= new_value) && (new_value < na); +} + +static int output_build_config_is_valid(elpa_index_t index, int n, int new_value) { + return new_value == 0 || new_value == 1; +} + +static int gpu_is_valid(elpa_index_t index, int n, int new_value) { + return new_value == 0 || new_value == 1; +} + +static int skewsymmetric_is_valid(elpa_index_t index, int n, int new_value) { + return new_value == 0 || new_value == 1; +} + +static int band_to_full_cardinality(elpa_index_t index) { + return 10; +} +static int 
band_to_full_enumerate(elpa_index_t index, int i) { + return i+1; +} + +// TODO shouldnt it be only for ELPA2?? +static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) { + int max_block=10; + return (1 <= new_value) && (new_value <= max_block); +} + +static int stripewidth_real_cardinality(elpa_index_t index) { + return 17; +} + +static int stripewidth_complex_cardinality(elpa_index_t index) { + return 17; +} + +static int stripewidth_real_enumerate(elpa_index_t index, int i) { + switch(i) { + case 0: + return 32; + case 1: + return 36; + case 2: + return 40; + case 3: + return 44; + case 4: + return 48; + case 5: + return 52; + case 6: + return 56; + case 7: + return 60; + case 8: + return 64; + case 9: + return 68; + case 10: + return 72; + case 11: + return 76; + case 12: + return 80; + case 13: + return 84; + case 14: + return 88; + case 15: + return 92; + case 16: + return 96; + } +} + +static int stripewidth_complex_enumerate(elpa_index_t index, int i) { + switch(i) { + case 0: + return 48; + case 1: + return 56; + case 2: + return 64; + case 3: + return 72; + case 4: + return 80; + case 5: + return 88; + case 6: + return 96; + case 7: + return 104; + case 8: + return 112; + case 9: + return 120; + case 10: + return 128; + case 11: + return 136; + case 12: + return 144; + case 13: + return 152; + case 14: + return 160; + case 15: + return 168; + case 16: + return 176; + } +} + +static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value) { + return (32 <= new_value) && (new_value <= 96); +} + +static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value) { + return (48 <= new_value) && (new_value <= 176); +} + +static int omp_threads_cardinality(elpa_index_t index) { + int max_threads; +#ifdef WITH_OPENMP + if (set_max_threads_glob == 0) { + max_threads_glob = omp_get_max_threads(); + set_max_threads_glob = 1; + } +#else + max_threads_glob = 1; + set_max_threads_glob = 1; +#endif + max_threads = 
max_threads_glob; + return max_threads; +} + +static int omp_threads_enumerate(elpa_index_t index, int i) { + return i + 1; +} + +static int omp_threads_is_valid(elpa_index_t index, int n, int new_value) { + int max_threads; +#ifdef WITH_OPENMP + if (set_max_threads_glob == 0) { + max_threads_glob = omp_get_max_threads(); + set_max_threads_glob = 1; + } +#else + max_threads_glob = 1; + set_max_threads_glob = 1; +#endif + max_threads = max_threads_glob; + return (1 <= new_value) && (new_value <= max_threads); +} + + +static int valid_with_gpu(elpa_index_t index, int n, int new_value) { + int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL); + if (gpu_is_active == 1) { + return ((new_value == 0 ) || (new_value == 1)); + } + else { + return new_value == 0; + } +} + +static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value) { + int solver = elpa_index_get_int_value(index, "solver", NULL); + int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL); + if ((solver == ELPA_SOLVER_1STAGE) && (gpu_is_active == 1)) { + return ((new_value == 0 ) || (new_value == 1)); + } + else { + return new_value == 0; + } +} + +static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value) { + int solver = elpa_index_get_int_value(index, "solver", NULL); + int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL); + if ((solver == ELPA_SOLVER_2STAGE) && (gpu_is_active == 1)) { + return ((new_value == 0 ) || (new_value == 1)); + } + else { + return new_value == 0; + } +} + +static int max_stored_rows_cardinality(elpa_index_t index) { + return 8; +} + +static int max_stored_rows_enumerate(elpa_index_t index, int i) { + switch(i) { + case 0: + return 15; + case 1: + return 31; + case 2: + return 47; + case 3: + return 63; + case 4: + return 79; + case 5: + return 95; + case 6: + return 111; + case 7: + return 127; + } +} + +static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value) { + int solver = 
elpa_index_get_int_value(index, "solver", NULL); + if (solver == ELPA_SOLVER_2STAGE) { + return new_value == 15; + } else { + return (15 <= new_value) && (new_value <= 127); + } +} + + +// TODO: this shoudl definitely be improved (too many options to test in autotuning) +static const int TILE_SIZE_STEP = 128; + +static int min_tile_size_cardinality(elpa_index_t index) { + int na; + if(index == NULL) + return 0; + if (elpa_index_int_value_is_set(index, "na") != 1) { + return 0; + } + na = elpa_index_get_int_value(index, "na", NULL); + return na/TILE_SIZE_STEP; +} + +static int min_tile_size_enumerate(elpa_index_t index, int i) { + return (i+1) * TILE_SIZE_STEP; +} + +static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value) { + return new_value % TILE_SIZE_STEP == 0; +} + +static int intermediate_bandwidth_cardinality(elpa_index_t index) { + int na, nblk; + if(index == NULL) + return 0; + if (elpa_index_int_value_is_set(index, "na") != 1) { + return 0; + } + na = elpa_index_get_int_value(index, "na", NULL); + + if (elpa_index_int_value_is_set(index, "nblk") != 1) { + return 0; + } + nblk = elpa_index_get_int_value(index, "nblk", NULL); + + return na/nblk; +} + +static int intermediate_bandwidth_enumerate(elpa_index_t index, int i) { + int nblk; + if(index == NULL) + return 0; + if (elpa_index_int_value_is_set(index, "nblk") != 1) { + return 0; + } + nblk = elpa_index_get_int_value(index, "nblk", NULL); + + return (i+1) * nblk; +} + +static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value) { + int na, nblk; + if (elpa_index_int_value_is_set(index, "na") != 1) { + return 0; + } + na = elpa_index_get_int_value(index, "na", NULL); + + if (elpa_index_int_value_is_set(index, "nblk") != 1) { + return 0; + } + nblk = elpa_index_get_int_value(index, "nblk", NULL); + + int solver = elpa_index_get_int_value(index, "solver", NULL); + if (solver == ELPA_SOLVER_1STAGE) { + return new_value == nblk; + } else { + if((new_value <= 1 ) || 
(new_value > na )) + return 0; + if(new_value % nblk != 0) { + fprintf(stderr, "intermediate bandwidth has to be multiple of nblk\n"); + return 0; + } + } +} + +static int cannon_buffer_size_cardinality(elpa_index_t index) { + return 2; +} + +static int cannon_buffer_size_enumerate(elpa_index_t index, int i) { + int np_rows; + if(index == NULL) + return 0; + if (elpa_index_int_value_is_set(index, "num_process_rows") != 1) { + return 0; + } + np_rows = elpa_index_get_int_value(index, "num_process_rows", NULL); + + // TODO: 0 is both error code and legal value? + if(i == 0) + return 0; + else + return np_rows - 1; +} + +static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value) { + int np_rows; + if(index == NULL) + return 0; + if (elpa_index_int_value_is_set(index, "num_process_rows") != 1) { + return 0; + } + np_rows = elpa_index_get_int_value(index, "num_process_rows", NULL); + + return ((new_value >= 0) && (new_value < np_rows)); +} + +elpa_index_t elpa_index_instance() { + elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct)); + +#define ALLOCATE(TYPE, PRINTF_SPEC, ...) 
\ + index->TYPE##_options.values = (TYPE*) calloc(nelements(TYPE##_entries), sizeof(TYPE)); \ + index->TYPE##_options.is_set = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \ + index->TYPE##_options.notified = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \ + for (int n = 0; n < nelements(TYPE##_entries); n++) { \ + TYPE default_value = TYPE##_entries[n].default_value; \ + if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \ + getenv_##TYPE(index, TYPE##_entries[n].base.env_default, NOTIFY_ENV_DEFAULT, n, &default_value, "Default for option"); \ + } \ + index->TYPE##_options.values[n] = default_value; \ + } + + FOR_ALL_TYPES(ALLOCATE) + + return index; +} + +static int is_tunable_but_overriden(elpa_index_t index, int i, int autotune_level, int autotune_domain) { + return (int_entries[i].autotune_level != 0) && + (int_entries[i].autotune_level <= autotune_level) && + (int_entries[i].autotune_domain & autotune_domain) && + (index->int_options.is_set[i]); +} + +static int is_tunable(elpa_index_t index, int i, int autotune_level, int autotune_domain) { + return (int_entries[i].autotune_level != 0) && + (int_entries[i].autotune_level <= autotune_level) && + (int_entries[i].autotune_domain & autotune_domain) && + (!index->int_options.is_set[i]); +} + +int elpa_index_autotune_cardinality(elpa_index_t index, int autotune_level, int autotune_domain) { + int N = 1; + + for (int i = 0; i < nelements(int_entries); i++) { \ + if (is_tunable(index, i, autotune_level, autotune_domain)) { + N *= int_entries[i].cardinality(index); + } + } + return N; +} + +void elpa_index_print_int_parameter(elpa_index_t index, char* buff, int i) +{ + int value = index->int_options.values[i]; + sprintf(buff, "%s = ", int_entries[i].base.name); + if (int_entries[i].to_string) { + sprintf(buff, "%s%d -> %s\n", buff, value, int_entries[i].to_string(value)); + } else { + sprintf(buff, "%s%d\n", buff, value); + } +} + +int 
elpa_index_set_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain, int current) { + int current_cpy = current; + char buff[100]; + int debug = elpa_index_get_int_value(index, "debug", NULL); + + //if(elpa_index_is_printing_mpi_rank(index)) fprintf(stderr, "***Trying a new autotuning index %d\n", current); + for (int i = 0; i < nelements(int_entries); i++) { + if (is_tunable(index, i, autotune_level, autotune_domain)) { + int value = int_entries[i].enumerate(index, current_cpy % int_entries[i].cardinality(index)); + //if(elpa_index_is_printing_mpi_rank(index)) fprintf(stderr, " * val[%d] = %d -> %d\n", i, current_cpy % int_entries[i].cardinality(index), value); + /* Try to set option i to that value */ + if (int_entries[i].valid(index, i, value)) { + index->int_options.values[i] = value; + } else { + //if(elpa_index_is_printing_mpi_rank(index)) fprintf(stderr, " *NOT VALID becaluse of i %d (%s) and value %d translated to %d\n", i, int_entries[i].base.name, current_cpy % int_entries[i].cardinality(index), value); + return 0; + } + current_cpy /= int_entries[i].cardinality(index); + } + } + if (debug == 1 && elpa_index_is_printing_mpi_rank(index)) { + fprintf(stderr, "\n*** AUTOTUNING: setting a new combination of parameters, idx %d ***\n", current); + elpa_index_print_autotune_parameters(index, autotune_level, autotune_domain); + fprintf(stderr, "***\n\n"); + } + + /* Could set all values */ + return 1; +} + +int elpa_index_print_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain) { + char buff[100]; + if (elpa_index_is_printing_mpi_rank(index)) { + for (int i = 0; i < nelements(int_entries); i++) { + if (is_tunable(index, i, autotune_level, autotune_domain)) { + elpa_index_print_int_parameter(index, buff, i); + fprintf(stderr, "%s", buff); + } + } + } + return 1; +} + +int elpa_index_print_autotune_state(elpa_index_t index, int autotune_level, int autotune_domain, int min_loc, + double min_val, int current, 
int cardinality, char* file_name) { + char buff[100]; + elpa_index_t index_best; + int min_loc_cpy = min_loc; + FILE *f; + + // get index with the currently best parameters + index_best = elpa_index_instance(); + + if(min_loc_cpy > -1){ + for (int i = 0; i < nelements(int_entries); i++) { + if (is_tunable(index, i, autotune_level, autotune_domain)) { + + int value = int_entries[i].enumerate(index, min_loc_cpy % int_entries[i].cardinality(index)); + /* we are setting the value for output only, we do not need to check consistency */ + index_best->int_options.values[i] = value; + min_loc_cpy /= int_entries[i].cardinality(index); + } + } + } + if (elpa_index_is_printing_mpi_rank(index)) { + int output_to_file = (strlen(file_name) > 0); + if(output_to_file) { + f = fopen(file_name, "w"); + if(f == NULL){ + fprintf(stderr, "Cannot open file %s in elpa_index_print_autotune_state\n", file_name); + return 0; + } + } + else { + f = stdout; + } + + if(!output_to_file) + fprintf(f, "\n"); + fprintf(f, "*** AUTOTUNING STATE ***\n"); + fprintf(f, "** This is the state of the autotuning object\n"); + fprintf(f, "autotune_level = %d -> %s\n", autotune_level, elpa_autotune_level_name(autotune_level)); + fprintf(f, "autotune_domain = %d -> %s\n", autotune_domain, elpa_autotune_domain_name(autotune_domain)); + fprintf(f, "autotune_cardinality = %d\n", cardinality); + fprintf(f, "current_idx = %d\n", current); + fprintf(f, "best_idx = %d\n", min_loc); + fprintf(f, "best_time = %g\n", min_val); + if(min_loc_cpy > -1) { + fprintf(f, "** The following parameters are autotuned with so far the best values\n"); + for (int i = 0; i < nelements(int_entries); i++) { + if (is_tunable(index, i, autotune_level, autotune_domain)) { + elpa_index_print_int_parameter(index_best, buff, i); + fprintf(f, "%s", buff); + } + } + fprintf(f, "** The following parameters would be autotuned on the selected autotuning level, but were overridden by the set() method\n"); + for (int i = 0; i < 
nelements(int_entries); i++) { + if (is_tunable_but_overriden(index, i, autotune_level, autotune_domain)) { + elpa_index_print_int_parameter(index, buff, i); + fprintf(f, "%s", buff); + } + } + }else{ + fprintf(f, "** No output after first step\n"); + } + fprintf(f, "*** END OF AUTOTUNING STATE ***\n"); + + if(output_to_file) + fclose(f); + } + elpa_index_free(index_best); + + return 1; +} + +const int LEN =1000; + +#define IMPLEMENT_LOAD_LINE(TYPE, PRINTF_SPEC, SCANF_SPEC, ...) \ + static int load_##TYPE##_line(FILE* f, const char* expected, TYPE* val) { \ + char line[LEN], s[LEN]; \ + int error = 0; \ + TYPE n; \ + if(fgets(line, LEN, f) == NULL){ \ + fprintf(stderr, "Loading autotuning state error: line is not there\n"); \ + error = 1; \ + } else{ \ + sscanf(line, "%s = " SCANF_SPEC "\n", s, &n); \ + if(strcmp(s, expected) != 0){ \ + fprintf(stderr, "Loading autotuning state error: expected %s, got %s\n", expected, s); \ + error = 1;\ + } else{ \ + *val = n; \ + } \ + } \ + if(error){ \ + fprintf(stderr, "Autotuning state file corrupted\n"); \ + return 0; \ + } \ + return 1; \ + } +FOR_ALL_TYPES(IMPLEMENT_LOAD_LINE) + +int elpa_index_load_autotune_state(elpa_index_t index, int* autotune_level, int* autotune_domain, int* min_loc, + double* min_val, int* current, int* cardinality, char* file_name) { + char line[LEN]; + FILE *f; + + //TODO: should be broadcasted, instead of read on all ranks + //if(elpa_index_is_printing_mpi_rank(index)){ + f = fopen(file_name, "r"); + + if (f == NULL) { + fprintf(stderr, "Cannont open file %s\n", file_name); + return(0); + } + + + if(fgets(line, LEN, f) == NULL) return 0; + if(fgets(line, LEN, f) == NULL) return 0; + if(! load_int_line(f, "autotune_level", autotune_level)) return 0; + if(! load_int_line(f, "autotune_domain", autotune_domain)) return 0; + if(! load_int_line(f, "autotune_cardinality", cardinality)) return 0; + if(! load_int_line(f, "current_idx", current)) return 0; + if(! 
load_int_line(f, "best_idx", min_loc)) return 0; + if(! load_double_line(f, "best_time", min_val)) return 0; + fclose(f); + // } + + return 1; +} + +const char STRUCTURE_PARAMETERS[] = "* Parameters describing structure of the computation:\n"; +const char EXPLICIT_PARAMETERS[] = "* Parameters explicitly set by the user:\n"; +const char DEFAULT_PARAMETERS[] = "* Parameters with default or environment value:\n"; + +int elpa_index_print_settings(elpa_index_t index, char *file_name) { + const int LEN =10000; + char out_structure[LEN], out_set[LEN], out_defaults[LEN], out_nowhere[LEN], buff[100]; + char (*out)[LEN]; + FILE *f; + + sprintf(out_structure, "%s", STRUCTURE_PARAMETERS); + sprintf(out_set, "%s", EXPLICIT_PARAMETERS); + sprintf(out_defaults, "%s", DEFAULT_PARAMETERS); + sprintf(out_nowhere, "Not to be printed:\n"); + if(elpa_index_is_printing_mpi_rank(index)){ + for (int i = 0; i < nelements(int_entries); i++) { + if(int_entries[i].base.print_flag == PRINT_STRUCTURE) { + out = &out_structure; + } else if(int_entries[i].base.print_flag == PRINT_YES && index->int_options.is_set[i]) { + out = &out_set; + } else if(int_entries[i].base.print_flag == PRINT_YES && !index->int_options.is_set[i]) { + out = &out_defaults; + } else + out = &out_nowhere; + elpa_index_print_int_parameter(index, buff, i); + sprintf(*out, "%s%s", *out, buff); + } + int output_to_file = (strlen(file_name) > 0); + if(output_to_file) { + f = fopen(file_name, "w"); + if(f == NULL){ + fprintf(stderr, "Cannot open file %s in elpa_index_print_settings\n", file_name); + return 0; + } + } + else { + f = stdout; + } + + fprintf(f, "*** ELPA STATE ***\n"); + fprintf(f, "%s%s%s", out_structure, out_set, out_defaults); + fprintf(f, "*** END OF ELPA STATE ***\n"); + if(output_to_file) + fclose(f); + } + + return 1; +} + +int elpa_index_load_settings(elpa_index_t index, char *file_name) { + const int LEN = 1000; + char line[LEN], s[LEN]; + int n; + FILE *f; + int skip, explicit; + + //TODO: should be 
broadcasted, instead of read on all ranks + //if(elpa_index_is_printing_mpi_rank(index)){ + f = fopen(file_name, "r"); + + if (f == NULL) { + fprintf(stderr, "Cannont open file %s\n", file_name); + return(0); + } + + skip = 1; + explicit = 0; + + while ((fgets(line, LEN, f)) != NULL) { + if(strcmp(line, EXPLICIT_PARAMETERS) == 0){ + skip = 0; + explicit = 1; + } + if(strcmp(line, DEFAULT_PARAMETERS) == 0){ + skip = 0; + explicit = 0; + } + + if(line[0] != '\n' && line[0] != '*'){ + sscanf(line, "%s = %d\n", s, &n); + if(! skip){ + int error = elpa_index_set_from_load_int_value(index, s, n, explicit); + } + } + } + fclose(f); + // } + + return 1; +} + + +int elpa_index_is_printing_mpi_rank(elpa_index_t index) +{ + int process_id; + if(elpa_index_int_value_is_set(index, "process_id")){ + process_id = elpa_index_get_int_value(index, "process_id", NULL); + return (process_id == 0); + } + printf("Warning: process_id not set, printing on all MPI ranks. This can happen with legacy API."); + return 1; +} diff -Nru elpa-2016.05.001/src/elpa_index.h elpa-2019.11.001/src/elpa_index.h --- elpa-2016.05.001/src/elpa_index.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/elpa_index.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,502 @@ +/* +! +! Copyright 2017, L. Hüdepohl and A. Marek, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! 
- IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! 
+*/ +#pragma once +#include +#include +#include +#include +#include + +#include "config.h" +#include + +#define nelements(x) (sizeof(x)/sizeof(x[0])) + +#define FOR_ALL_TYPES(X) \ + X(int, "%d", "%d", -1) \ + X(double, "%g", "%lg", NAN) + +/* A simple structure for storing values to a pre-set + * number of keys */ + +/* Forward declaration of configuration structure */ +typedef struct elpa_index_struct* elpa_index_t; + +/* Function type for the cardinality */ +typedef int (*elpa_index_cardinality_t)(elpa_index_t index); + +/* Function type to enumerate all possible values, starting from 0 */ +typedef int (*elpa_index_enumerate_int_option_t)(elpa_index_t index, int i); + +/* Function types to check the validity of a value */ +typedef int (*elpa_index_valid_int_t)(elpa_index_t index, int n, int new_value); +typedef int (*elpa_index_valid_double_t)(elpa_index_t index, int n, double new_value); + +/* Function type to give a string representation of a value */ +typedef const char* (*elpa_index_to_string_int_t)(int n); + + +typedef struct { + char *name; + char *description; + char *env_default; + char *env_force; + int once; + int readonly; + int print_flag; +} elpa_index_entry_t; + + +typedef struct { + elpa_index_entry_t base; + int default_value; + int autotune_level; + int autotune_domain; + elpa_index_valid_int_t valid; + elpa_index_cardinality_t cardinality; + elpa_index_enumerate_int_option_t enumerate; + elpa_index_to_string_int_t to_string; +} elpa_index_int_entry_t; + + +typedef struct { + elpa_index_entry_t base; + double default_value; + elpa_index_valid_double_t valid; +} elpa_index_double_entry_t; + +enum NOTIFY_FLAGS { + NOTIFY_ENV_DEFAULT = (1<<0), + NOTIFY_ENV_FORCE = (1<<1), +}; + +enum PRINT_FLAGS { + PRINT_STRUCTURE, + PRINT_YES, + PRINT_NO, +}; + +struct elpa_index_struct { +#define STRUCT_MEMBERS(TYPE, ...) 
\ + struct { \ + TYPE *values; \ + int *is_set; \ + int *notified; \ + } TYPE##_options; + FOR_ALL_TYPES(STRUCT_MEMBERS) +}; + + +/* + !f> interface + !f> function elpa_index_instance_c() result(index) bind(C, name="elpa_index_instance") + !f> import c_ptr + !f> type(c_ptr) :: index + !f> end function + !f> end interface + */ +elpa_index_t elpa_index_instance(); + + +/* + !f> interface + !f> subroutine elpa_index_free_c(index) bind(C, name="elpa_index_free") + !f> import c_ptr + !f> type(c_ptr), value :: index + !f> end subroutine + !f> end interface + */ +void elpa_index_free(elpa_index_t index); + + +/* + !f> interface + !f> function elpa_index_get_int_value_c(index, name, success) result(value) & + !f> bind(C, name="elpa_index_get_int_value") + !f> import c_ptr, c_int, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f>#ifdef USE_FORTRAN2008 + !f> integer(kind=c_int), optional, intent(out) :: success + !f>#else + !f> integer(kind=c_int), intent(out) :: success + !f>#endif + !f> integer(kind=c_int) :: value + !f> end function + !f> end interface + */ +int elpa_index_get_int_value(elpa_index_t index, char *name, int *success); + + +/* + !f> interface + !f> function elpa_index_set_int_value_c(index, name, value) result(success) & + !f> bind(C, name="elpa_index_set_int_value") + !f> import c_ptr, c_int, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int),intent(in), value :: value + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + */ +int elpa_index_set_int_value(elpa_index_t index, char *name, int value); + + +/* + !f> interface + !f> function elpa_index_int_value_is_set_c(index, name) result(success) bind(C, name="elpa_index_int_value_is_set") + !f> import c_ptr, c_int, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end 
interface + */ +int elpa_index_int_value_is_set(elpa_index_t index, char *name); + + +/* + !f> interface + !f> function elpa_index_get_int_loc_c(index, name) result(loc) bind(C, name="elpa_index_get_int_loc") + !f> import c_ptr, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> type(c_ptr) :: loc + !f> end function + !f> end interface + */ +int* elpa_index_get_int_loc(elpa_index_t index, char *name); + + +/* + !f> interface + !f> function elpa_index_get_double_value_c(index, name, success) result(value) bind(C, name="elpa_index_get_double_value") + !f> import c_ptr, c_int, c_double, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f>#ifdef USE_FORTRAN2008 + !f> integer(kind=c_int), intent(out), optional :: success + !f>#else + !f> integer(kind=c_int), intent(out) :: success + !f>#endif + !f> real(kind=c_double) :: value + !f> end function + !f> end interface + */ +double elpa_index_get_double_value(elpa_index_t index, char *name, int *success); + + +/* + !f> interface + !f> function elpa_index_set_double_value_c(index, name, value) result(success) & + !f> bind(C, name="elpa_index_set_double_value") + !f> import c_ptr, c_int, c_double, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> real(kind=c_double),intent(in), value :: value + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + */ +int elpa_index_set_double_value(elpa_index_t index, char *name, double value); + + +/* + !f> interface + !f> function elpa_index_double_value_is_set_c(index, name) result(success) & + !f> bind(C, name="elpa_index_double_value_is_set") + !f> import c_ptr, c_int, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + */ +int elpa_index_double_value_is_set(elpa_index_t index, char *name); + + +/* + !f> 
interface + !f> function elpa_index_get_double_loc_c(index, name) result(loc) bind(C, name="elpa_index_get_double_loc") + !f> import c_ptr, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> type(c_ptr) :: loc + !f> end function + !f> end interface + */ +double* elpa_index_get_double_loc(elpa_index_t index, char *name); + + +/* + !f> interface + !f> function elpa_index_value_is_set_c(index, name) result(success) bind(C, name="elpa_index_value_is_set") + !f> import c_ptr, c_int, c_char + !f> type(c_ptr), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + */ +int elpa_index_value_is_set(elpa_index_t index, char *name); + + +/* + !pf> interface + !pf> function elpa_int_value_to_string_c(name, value, string) & + !pf> result(error) bind(C, name="elpa_int_value_to_string") + !pf> import c_int, c_ptr, c_char + !pf> character(kind=c_char), intent(in) :: name(*) + !pf> integer(kind=c_int), intent(in), value :: value + !pf> type(c_ptr), intent(out) :: string + !pf> integer(kind=c_int) :: error + !pf> end function + !pf> end interface + !pf> + */ +int elpa_int_value_to_string(char *name, int value, const char **string); + + +/* + !pf> interface + !pf> pure function elpa_int_value_to_strlen_c(name, value) & + !pf> result(length) bind(C, name="elpa_int_value_to_strlen") + !pf> import c_int, c_ptr, c_char + !pf> character(kind=c_char), intent(in) :: name(*) + !pf> integer(kind=c_int), intent(in), value :: value + !pf> integer(kind=c_int) :: length + !pf> end function + !pf> end interface + !pf> + */ +int elpa_int_value_to_strlen(char *name, int value); + + +/* + !f> interface + !f> pure function elpa_index_int_value_to_strlen_c(index, name) & + !f> result(length) bind(C, name="elpa_index_int_value_to_strlen") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> character(kind=c_char), intent(in) :: name(*) 
+ !f> integer(kind=c_int) :: length + !f> end function + !f> end interface + !f> + */ +int elpa_index_int_value_to_strlen(elpa_index_t index, char *name); + + +/* + !f> interface + !f> function elpa_int_string_to_value_c(name, string, value) result(error) bind(C, name="elpa_int_string_to_value") + !f> import c_int, c_ptr, c_char + !f> character(kind=c_char), intent(in) :: name(*) + !f> character(kind=c_char), intent(in) :: string(*) + !f> integer(kind=c_int), intent(out) :: value + !f> integer(kind=c_int) :: error + !f> end function + !f> end interface + !f> + */ +int elpa_int_string_to_value(char *name, char *string, int *value); + + +/* + !f> interface + !f> function elpa_option_cardinality_c(name) result(n) bind(C, name="elpa_option_cardinality") + !f> import c_int, c_char + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int) :: n + !f> end function + !f> end interface + !f> + */ +int elpa_option_cardinality(char *name); + +/* + !f> interface + !f> function elpa_option_enumerate_c(name, i) result(value) bind(C, name="elpa_option_enumerate") + !f> import c_int, c_char + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int), intent(in), value :: i + !f> integer(kind=c_int) :: value + !f> end function + !f> end interface + !f> + */ +int elpa_option_enumerate(char *name, int i); + + +/* + !f> interface + !f> function elpa_index_int_is_valid_c(index, name, new_value) result(success) & + !f> bind(C, name="elpa_index_int_is_valid") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> character(kind=c_char), intent(in) :: name(*) + !f> integer(kind=c_int), intent(in), value :: new_value + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_int_is_valid(elpa_index_t index, char *name, int new_value); + + +/* + !f> interface + !f> function elpa_index_autotune_cardinality_c(index, autotune_level, autotune_domain) result(n) & + !f> bind(C, 
name="elpa_index_autotune_cardinality") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> integer(kind=c_int), intent(in), value :: autotune_level, autotune_domain + !f> integer(kind=c_int) :: n + !f> end function + !f> end interface + !f> + */ +int elpa_index_autotune_cardinality(elpa_index_t index, int autotune_level, int autotune_domain); + + +/* + !f> interface + !f> function elpa_index_set_autotune_parameters_c(index, autotune_level, autotune_domain, n) result(success) & + !f> bind(C, name="elpa_index_set_autotune_parameters") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> integer(kind=c_int), intent(in), value :: autotune_level, autotune_domain, n + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_set_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain, int n); + +/* + !f> interface + !f> function elpa_index_print_autotune_parameters_c(index, autotune_level, autotune_domain) result(success) & + !f> bind(C, name="elpa_index_print_autotune_parameters") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> integer(kind=c_int), intent(in), value :: autotune_level, autotune_domain + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_print_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain); + +/* + !f> interface + !f> function elpa_index_print_settings_c(index, file_name) result(success) & + !f> bind(C, name="elpa_index_print_settings") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> character(kind=c_char), intent(in) :: file_name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_print_settings(elpa_index_t index, char* filename); + +/* + !f> interface + !f> function elpa_index_load_settings_c(index, file_name) 
result(success) & + !f> bind(C, name="elpa_index_load_settings") + !f> import c_int, c_ptr, c_char + !f> type(c_ptr), intent(in), value :: index + !f> character(kind=c_char), intent(in) :: file_name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_load_settings(elpa_index_t index, char* filename); + +/* + !f> interface + !f> function elpa_index_print_autotune_state_c(index, autotune_level, autotune_domain, min_loc, & + !f> min_val, current, cardinality, file_name) result(success) & + !f> bind(C, name="elpa_index_print_autotune_state") + !f> import c_int, c_ptr, c_char, c_double + !f> type(c_ptr), intent(in), value :: index + !f> integer(kind=c_int), intent(in), value :: autotune_level, autotune_domain, min_loc, current, cardinality + !f> real(kind=c_double), intent(in), value :: min_val + !f> character(kind=c_char), intent(in) :: file_name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_print_autotune_state(elpa_index_t index, int autotune_level, int autotune_domain, int min_loc, + double min_val, int current, int cardinality, char* filename); + +/* + !f> interface + !f> function elpa_index_load_autotune_state_c(index, autotune_level, autotune_domain, min_loc, & + !f> min_val, current, cardinality, file_name) result(success) & + !f> bind(C, name="elpa_index_load_autotune_state") + !f> import c_int, c_ptr, c_char, c_double + !f> type(c_ptr), intent(in), value :: index + !f> integer(kind=c_int), intent(in) :: autotune_level, autotune_domain, min_loc, current, cardinality + !f> real(kind=c_double), intent(in) :: min_val + !f> character(kind=c_char), intent(in) :: file_name(*) + !f> integer(kind=c_int) :: success + !f> end function + !f> end interface + !f> + */ +int elpa_index_load_autotune_state(elpa_index_t index, int* autotune_level, int* autotune_domain, int* min_loc, + double* min_val, int* current, int* cardinality, char* filename); + +int 
elpa_index_is_printing_mpi_rank(elpa_index_t index); diff -Nru elpa-2016.05.001/src/elpa_qr/elpa_pdgeqrf.F90 elpa-2019.11.001/src/elpa_qr/elpa_pdgeqrf.F90 --- elpa-2016.05.001/src/elpa_qr/elpa_pdgeqrf.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_qr/elpa_pdgeqrf.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,2414 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! 
any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! - -#include "config-f90.h" - -module elpa_pdgeqrf - - use elpa1_compute - use elpa_pdlarfb - use qr_utils_mod - use elpa_mpi - implicit none - - PRIVATE - - public :: qr_pdgeqrf_2dcomm - public :: qr_pqrparam_init - public :: qr_pdlarfg2_1dcomm_check - - - contains - - subroutine qr_pdgeqrf_2dcomm(a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, & - work, workLength, lwork, m, n, mb, nb, rowidx, colidx, & - rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1, rank_ = 2, eps_ = 3 - - ! input variables (local) - integer(kind=ik), intent(in) :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, & - colsT, workLength - - ! input variables (global) - integer(kind=ik) :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - real(kind=rk) :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*) -#else - integer(kind=ik) :: PQRPARAM(1:11) - real(kind=rk) :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), & - t(1:ldt,1:colsT), work(1:workLength) -#endif - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! input variables derived from PQRPARAM - integer(kind=ik) :: updatemode,tmerge,size2d - - ! 
local scalars - integer(kind=ik) :: mpierr,mpirank_cols,broadcast_size,mpirank_rows - integer(kind=ik) :: mpirank_cols_qr,mpiprocs_cols - integer(kind=ik) :: lcols_temp,lcols,icol,lastcol - integer(kind=ik) :: baseoffset,offset,idx,voffset - integer(kind=ik) :: update_voffset,update_tauoffset - integer(kind=ik) :: update_lcols - integer(kind=ik) :: work_offset - - real(kind=rk) :: dbroadcast_size(1),dtmat_bcast_size(1) - real(kind=rk) :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1) - integer(kind=ik) :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size - integer(kind=ik) :: remaining_cols - integer(kind=ik) :: total_cols - integer(kind=ik) :: incremental_update_size ! needed for incremental update mode - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_2dcomm") -#endif - size2d = PQRPARAM(1) - updatemode = PQRPARAM(2) - tmerge = PQRPARAM(3) - - ! copy value before we are going to filter it - total_cols = n - call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr) - call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr) - call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr) - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & - PQRPARAM(4),mpicomm_rows,blockheuristic) -#else - call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & - PQRPARAM(4:11),mpicomm_rows,blockheuristic) -#endif - call qr_pdgeqrf_pack_unpack(v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows) - call qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0) - - pdlarft_size(1) = 0.0d0 - - call qr_pdlarfb_1dcomm(m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, & - pdlarfb_size(1),-1) - call qr_tmerge_pdlarfb_1dcomm(m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, & - 
mpicomm_rows,tmerge_pdlarfb_size(1),-1) - - - temptau_offset = 1 - temptau_size = total_cols - broadcast_offset = temptau_offset + temptau_size - broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) - work_offset = broadcast_offset + broadcast_size - - if (lwork .eq. -1) then - work(1) = (DBLE(temptau_size) + DBLE(broadcast_size) + max(pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1), & - tmerge_pdlarfb_size(1))) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_2dcomm") -#endif - return - end if - - lastcol = colidx-total_cols+1 - voffset = total_cols - - incremental_update_size = 0 - - ! clear v buffer: just ensure that there is no junk in the upper triangle - ! part, otherwise pdlarfb gets some problems - ! pdlarfl(2) do not have these problems as they are working more on a vector - ! basis - v(1:ldv,1:total_cols) = 0.0d0 - - icol = colidx - - remaining_cols = total_cols - - !print *,'start decomposition',m,rowidx,colidx - - do while (remaining_cols .gt. 0) - - ! determine rank of process column with next qr block - mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols) - - ! lcols can't be larger than than nb - ! exception: there is only one process column - - ! however, we might not start at the first local column. - ! therefore assume a matrix of size (1xlcols) starting at (1,icol) - ! determine the real amount of local columns - lcols_temp = min(nb,(icol-lastcol+1)) - - ! blocking parameter - lcols_temp = max(min(lcols_temp,size2d),1) - - ! determine size from last decomposition column - ! to first decomposition column - call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, & - mpirank_cols_qr,mpiprocs_cols, & - lcols,baseoffset,offset) - - voffset = remaining_cols - lcols + 1 - - idx = rowidx - colidx + icol - - if (mpirank_cols .eq. mpirank_cols_qr) then - ! 
qr decomposition part - - tau(offset:offset+lcols-1) = 0.0d0 - -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & - work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), & - mpicomm_rows,blockheuristic) - -#else - call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & - work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), & - mpicomm_rows,blockheuristic) -#endif - - ! pack broadcast buffer (v + tau) - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,& - idx,rev,0,mpicomm_rows) - - ! determine broadcast size - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,& - 0,mpicomm_rows) - broadcast_size = dbroadcast_size(1) - - !if (mpirank_rows .eq. 0) then - ! pack tmatrix into broadcast buffer and calculate new size - call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt, & - work(broadcast_offset+broadcast_size),lwork,lcols,0) - call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0) - broadcast_size = broadcast_size + dtmat_bcast_size(1) - !end if - - ! initiate broadcast (send part) -#ifdef WITH_MPI - call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & - mpirank_cols_qr,mpicomm_cols,mpierr) -#endif - ! copy tau parts into temporary tau buffer - work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1) - - !print *,'generated tau:', tau(offset) - else - ! vector exchange part - - ! 
determine broadcast size - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) - broadcast_size = dbroadcast_size(1) - - call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & - dtmat_bcast_size(1),-1,lcols,0) - tmat_bcast_size = dtmat_bcast_size(1) - - !print *,'broadcast_size (nonqr)',broadcast_size - broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) - - ! initiate broadcast (recv part) -#ifdef WITH_MPI - call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & - mpirank_cols_qr,mpicomm_cols,mpierr) -#endif - ! last n*n elements in buffer are (still empty) T matrix elements - ! fetch from first process in each column - - ! unpack broadcast buffer (v + tau) - call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) - - ! now send t matrix to other processes in our process column - broadcast_size = dbroadcast_size(1) - tmat_bcast_size = dtmat_bcast_size(1) - - ! t matrix should now be available on all processes => unpack - call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & - work(broadcast_offset+broadcast_size),lwork,lcols,1) - end if - - remaining_cols = remaining_cols - lcols - - ! apply householder vectors to whole trailing matrix parts (if any) - - update_voffset = voffset - update_tauoffset = icol - update_lcols = lcols - incremental_update_size = incremental_update_size + lcols - - icol = icol - lcols - ! count colums from first column of global block to current index - call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, & - mpirank_cols,mpiprocs_cols, & - lcols,baseoffset,offset) - - if (lcols .gt. 0) then - - !print *,'updating trailing matrix' - - if (updatemode .eq. ichar('I')) then - print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1' - else if (updatemode .eq. ichar('F')) then - ! 
full update no merging - call qr_pdlarfb_1dcomm(m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, & - work(temptau_offset+update_voffset-1), & - t(update_voffset,update_voffset),ldt, & - rowidx,idx,1,mpicomm_rows,work(work_offset),lwork) - else - ! full update + merging default - call qr_tmerge_pdlarfb_1dcomm(m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, & - v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, & - work(work_offset),lwork) - end if - else - if (updatemode .eq. ichar('I')) then - print *,'sole merging of (incremental) T matrix', mpirank_cols, & - n-(update_voffset+incremental_update_size-1) - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+incremental_update_size-1), & - incremental_update_size,v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) - - ! reset for upcoming incremental updates - incremental_update_size = 0 - else if (updatemode .eq. ichar('M')) then - ! final merge - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, & - v(1,update_voffset),ldv, & - t(update_voffset,update_voffset),ldt, & - a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) - else - ! full updatemode - nothing to update - end if - - ! reset for upcoming incremental updates - incremental_update_size = 0 - end if - end do - - if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then - ! 
finally merge all small T parts - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork) - end if - - !print *,'stop decomposition',rowidx,colidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_2dcomm") -#endif - end subroutine qr_pdgeqrf_2dcomm - - subroutine qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic) - use precision - use ELPA1 -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3 - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! derived input variables - - ! derived further input variables from QR_PQRPARAM - integer(kind=ik) :: size1d,updatemode,tmerge - - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! local scalars - integer(kind=ik) :: nr_blocks,remainder,current_block,aoffset,idx,updatesize - real(kind=rk) :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_1dcomm") -#endif - size1d = max(min(PQRPARAM(1),n),1) - updatemode = PQRPARAM(2) - tmerge = PQRPARAM(3) - - if (lwork .eq. -1) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & - m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic) -#else - call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, & - m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - ! 
reserve more space for incremental mode - call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt, & - a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1) - - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1) - - work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1)) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_1dcomm") -#endif - return - end if - - nr_blocks = n / size1d - remainder = n - nr_blocks*size1d - - current_block = 0 - do while (current_block .lt. nr_blocks) - idx = rowidx-current_block*size1d - updatesize = n-(current_block+1)*size1d - aoffset = 1+updatesize -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & - m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) - -#else - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & - m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - if (updatemode .eq. ichar('M')) then - ! full update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,updatesize,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork) - else if (updatemode .eq. ichar('I')) then - if (updatesize .ge. size1d) then - ! incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,size1d,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork) - - else ! only remainder left - ! incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,remainder,current_block*size1d,size1d, & - v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & - a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork) - end if - else ! full update no merging is default - ! 
full update no merging - call qr_pdlarfb_1dcomm(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, & - tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork) - end if - - ! move on to next block - current_block = current_block+1 - end do - - if (remainder .gt. 0) then - aoffset = 1 - idx = rowidx-size1d*nr_blocks -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & - m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) - -#else - call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & - m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) -#endif - if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then - ! final merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,size1d*nr_blocks,remainder, & - v,ldv,t,ldt, & - a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter - end if - end if - - if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then - ! finally merge all small T parts - call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_1dcomm") -#endif - - end subroutine qr_pdgeqrf_1dcomm - - ! local a and tau are assumed to be positioned at the right column from a local - ! perspective - ! TODO: if local amount of data turns to zero the algorithm might produce wrong - ! results (probably due to old buffer contents) - subroutine qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic) - use precision - use ELPA1 -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4 - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) - - ! 
input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - real(kind=rk) :: blockheuristic(*) - - ! derived further input variables from QR_PQRPARAM - integer(kind=ik) :: maxrank,hgmode,updatemode - - ! local scalars - integer(kind=ik) :: icol,incx,idx - real(kind=rk) :: pdlarfg_size(1),pdlarf_size(1),total_size - real(kind=rk) :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1) - real(kind=rk) :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1) - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: rank,lastcol,actualrank,nextrank - integer(kind=ik) :: update_cols,decomposition_cols - integer(kind=ik) :: current_column -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqr2_1dcomm") -#endif - - maxrank = min(PQRPARAM(1),n) - updatemode = PQRPARAM(2) - hgmode = PQRPARAM(4) - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - if (trans .eq. 1) then - incx = lda - else - incx = 1 - end if - - if (lwork .eq. 
-1) then - call qr_pdlarfg_1dcomm(a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm) - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, & - rev,mpicomm,actualrank) - - call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank) - -#else - call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), & - rev,mpicomm,actualrank) - - call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank) -#endif - call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm) - pdlarft_size(1) = 0.0d0 - call qr_pdlarfb_1dcomm(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1) - pdlarft_pdlarfb_size(1) = 0.0d0 - call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode,mpicomm,tmerge_pdlarfb_size(1),-1) - - total_size = max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), & - pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)) - - work(1) = total_size -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqr2_1dcomm") -#endif - return - end if - - icol = 1 - lastcol = min(rowidx,n) - decomposition_cols = lastcol - update_cols = n - do while (decomposition_cols .gt. 0) ! local qr block - icol = lastcol-decomposition_cols+1 - idx = rowidx-icol+1 - - ! get possible rank size - ! limited by number of columns and remaining rows - rank = min(n-icol+1,maxrank,idx) - - current_column = n-icol+1-rank+1 - - if (rank .eq. 
1) then - - call qr_pdlarfg_1dcomm(a(1,current_column),incx, & - tau(current_column),work,lwork, & - m,idx,mb,hgmode,1,mpicomm) - - v(1:ldv,current_column) = 0.0d0 - call qr_pdlarfg_copy_1dcomm(a(1,current_column),incx, & - v(1,current_column),1, & - m,baseidx,idx,mb,1,mpicomm) - - ! initialize t matrix part - t(current_column,current_column) = tau(current_column) - - actualrank = 1 - - else if (rank .eq. 2) then -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank) - -#else - call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) -#endif - else -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank) - -#else - call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & - t(current_column,current_column),ldt,v(1,current_column),ldv, & - baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) -#endif - end if - - blockheuristic(actualrank) = blockheuristic(actualrank) + 1 - - ! the blocked decomposition versions already updated their non - ! decomposed parts using their information after communication - update_cols = decomposition_cols - rank - decomposition_cols = decomposition_cols - actualrank - - ! needed for incremental update - nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1) - - if (current_column .gt. 1) then - idx = rowidx-icol+1 - - if (updatemode .eq. ichar('I')) then - ! 
incremental update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, & - v(1,current_column+(rank-actualrank)),ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,& - mpicomm,work,lwork) - else - ! full update + merging - call qr_tmerge_pdlarfb_1dcomm(m,mb,update_cols,n-(current_column+rank-1),actualrank, & - v(1,current_column+(rank-actualrank)),ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork) - end if - else - call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)), & - ldv, & - t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & - a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) - end if - - end do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqr2_1dcomm") -#endif - end subroutine qr_pdgeqr2_1dcomm - - ! incx == 1: column major - ! incx != 1: row major - subroutine qr_pdlarfg_1dcomm(x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,mpi_comm) - - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2, eps_ = 3 - - ! input variables (local) - integer(kind=ik) :: incx,lwork,hgmode - real(kind=rk) :: x(*),work(*) - - ! input variables (global) - integer(kind=ik) :: mpi_comm,nb,idx,n,rev - - ! output variables (global) - real(kind=rk) :: tau - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top - integer(kind=ik) :: sendsize,recvsize - integer(kind=ik) :: local_size,local_offset,baseoffset - integer(kind=ik) :: topidx,top,iproc - real(kind=rk) :: alpha,xnorm,dot,xf - - ! external functions - real(kind=rk), external :: ddot,dlapy2,dnrm2 - external :: dscal - - ! intrinsic -! 
intrinsic sign -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg_1dcomm") -#endif - if (idx .le. 1) then - tau = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - return - end if - call MPI_Comm_rank(mpi_comm, mpirank, mpierr) - call MPI_Comm_size(mpi_comm, mpiprocs, mpierr) - ! calculate expected work size and store in work(1) - if (hgmode .eq. ichar('s')) then - ! allreduce (MPI_SUM) - sendsize = 2 - recvsize = sendsize - else if (hgmode .eq. ichar('x')) then - ! alltoall - sendsize = mpiprocs*2 - recvsize = sendsize - else if (hgmode .eq. ichar('g')) then - ! allgather - sendsize = 2 - recvsize = mpiprocs*sendsize - else - ! no exchange at all (benchmarking) - sendsize = 2 - recvsize = sendsize - end if - - if (lwork .eq. -1) then - work(1) = DBLE(sendsize + recvsize) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - return - end if - - ! Processor id for global index of top element - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! calculate and exchange information - if (hgmode .eq. ichar('s')) then - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - dot = ddot(local_size, & - x(local_offset), incx, & - x(local_offset), incx) - - work(1) = alpha - work(2) = dot -#ifdef WITH_MPI - call mpi_allreduce(work(1),work(sendsize+1), & - sendsize,mpi_real8,mpi_sum, & - mpi_comm,mpierr) -#else - work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) -#endif - alpha = work(sendsize+1) - xnorm = sqrt(work(sendsize+2)) - else if (hgmode .eq. ichar('x')) then - if (mpirank .eq. 
mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - xnorm = dnrm2(local_size, x(local_offset), incx) - - do iproc=0,mpiprocs-1 - work(2*iproc+1) = alpha - work(2*iproc+2) = xnorm - end do -#ifdef WITH_MPI - call mpi_alltoall(work(1),2,mpi_real8, & - work(sendsize+1),2,mpi_real8, & - mpi_comm,mpierr) -#else - work(sendsize+1:sendsize+1+2-1) = work(1:2) -#endif - ! extract alpha value - alpha = work(sendsize+1+mpirank_top*2) - - ! copy norm parts of buffer to beginning - do iproc=0,mpiprocs-1 - work(iproc+1) = work(sendsize+1+2*iproc+1) - end do - - xnorm = dnrm2(mpiprocs, work(1), 1) - else if (hgmode .eq. ichar('g')) then - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - xnorm = dnrm2(local_size, x(local_offset), incx) - work(1) = alpha - work(2) = xnorm - - ! allgather -#ifdef WITH_MPI - call mpi_allgather(work(1),sendsize,mpi_real8, & - work(sendsize+1),sendsize,mpi_real8, & - mpi_comm,mpierr) -#else - work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) -#endif - ! extract alpha value - alpha = work(sendsize+1+mpirank_top*2) - - ! copy norm parts of buffer to beginning - do iproc=0,mpiprocs-1 - work(iproc+1) = work(sendsize+1+2*iproc+1) - end do - - xnorm = dnrm2(mpiprocs, work(1), 1) - else - ! dnrm2 - xnorm = dnrm2(local_size, x(local_offset), incx) - - if (mpirank .eq. mpirank_top) then - alpha = x(top) - else - alpha = 0.0d0 - end if - - ! no exchange at all (benchmarking) - - xnorm = 0.0d0 - end if - - !print *,'ref hg:', idx,xnorm,alpha - !print *,x(1:n) - - ! calculate householder information - if (xnorm .eq. 0.0d0) then - ! H = I - - tau = 0.0d0 - else - ! General case - - call hh_transform_real(alpha,xnorm**2,xf,tau) - if (mpirank .eq. mpirank_top) then - x(top) = alpha - end if - - call dscal(local_size, xf, & - x(local_offset), incx) - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - end if - - ! 
useful for debugging - !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau - !print *,x(1:n) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_1dcomm") -#endif - end subroutine qr_pdlarfg_1dcomm - - subroutine qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,idx,baseidx,mb,rev,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - integer(kind=ik) :: actualk - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local scalars - real(kind=rk) :: dseedwork_size(1) - integer(kind=ik) :: seedwork_size,seed_size - integer(kind=ik) :: seedwork_offset,seed_offset - logical :: accurate -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm") -#endif - - call qr_pdlarfg2_1dcomm_seed(a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm) - seedwork_size = dseedwork_size(1) - seed_size = seedwork_size - - if (lwork .eq. -1) then - work(1) = seedwork_size + seed_size -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - return - end if - - seedwork_offset = 1 - seed_offset = seedwork_offset + seedwork_size - - eps = PQRPARAM(3) - - ! check for border cases (only a 2x2 matrix left) - if (idx .le. 
1) then - tau(1:2) = 0.0d0 - t(1:2,1:2) = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - return - end if - - call qr_pdlarfg2_1dcomm_seed(a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm) - - if (eps .gt. 0) then - accurate = qr_pdlarfg2_1dcomm_check(work(seed_offset),eps) - else - accurate = .true. - end if - - call qr_pdlarfg2_1dcomm_vector(a(1,2),1,tau(2),work(seed_offset), & - m,mb,idx,0,1,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,2),1, & - v(1,2),1, & - m,baseidx,idx,mb,1,mpicomm) - - call qr_pdlarfg2_1dcomm_update(v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm) - - ! check for 2x2 matrix case => only one householder vector will be - ! generated - if (idx .gt. 2) then - if (accurate .eqv. .true.) then - call qr_pdlarfg2_1dcomm_vector(a(1,1),1,tau(1),work(seed_offset), & - m,mb,idx-1,1,1,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,1),1, & - v(1,1),1, & - m,baseidx,idx-1,mb,1,mpicomm) - - ! generate fuse element - call qr_pdlarfg2_1dcomm_finalize_tmatrix(work(seed_offset),tau,t,ldt) - - actualk = 2 - else - t(1,1) = 0.0d0 - t(1,2) = 0.0d0 - t(2,2) = tau(2) - - actualk = 1 - end if - else - t(1,1) = 0.0d0 - t(1,2) = 0.0d0 - t(2,2) = tau(2) - - ! no more vectors to create - - tau(1) = 0.0d0 - - actualk = 2 - - !print *,'rank2: no more data' - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm") -#endif - - end subroutine qr_pdlarfg2_1dcomm_ref - - subroutine qr_pdlarfg2_1dcomm_seed(a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*),work(*),seed(*) - - ! input variables (global) - integer(kind=ik) :: n,nb,idx,rev,mpicomm - - ! output variables (global) - - ! external functions - real(kind=rk), external :: ddot - - ! 
local scalars - real(kind=rk) :: top11,top21,top12,top22 - real(kind=rk) :: dot11,dot12,dot22 - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: mpirank_top11,mpirank_top21 - integer(kind=ik) :: top11_offset,top21_offset - integer(kind=ik) :: baseoffset - integer(kind=ik) :: local_offset1,local_size1 - integer(kind=ik) :: local_offset2,local_size2 - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_seed") -#endif - - if (lwork .eq. -1) then - work(1) = DBLE(8) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_seed") -#endif - return - end if - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size1,baseoffset,local_offset1) - - call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, & - local_size2,baseoffset,local_offset2) - - mpirank_top11 = MOD((idx-1)/nb,mpiprocs) - mpirank_top21 = MOD((idx-2)/nb,mpiprocs) - - top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0) - top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0) - - if (mpirank_top11 .eq. mpirank) then - top11 = a(top11_offset,2) - top12 = a(top11_offset,1) - else - top11 = 0.0d0 - top12 = 0.0d0 - end if - - if (mpirank_top21 .eq. mpirank) then - top21 = a(top21_offset,2) - top22 = a(top21_offset,1) - else - top21 = 0.0d0 - top22 = 0.0d0 - end if - - ! calculate 3 dot products - dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1) - dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1) - dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1) - - ! store results in work buffer - work(1) = top11 - work(2) = dot11 - work(3) = top12 - work(4) = dot12 - work(5) = top21 - work(6) = top22 - work(7) = dot22 - work(8) = 0.0d0 ! fill up buffer - - ! 
exchange partial results -#ifdef WITH_MPI - call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, & - mpicomm, mpierr) -#else - seed(1:8) = work(1:8) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_seed") -#endif - end subroutine qr_pdlarfg2_1dcomm_seed - - logical function qr_pdlarfg2_1dcomm_check(seed,eps) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables - real(kind=rk) :: seed(*) - integer(kind=ik) :: eps - - ! local scalars - real(kind=rk) :: epsd,first,second,first_second,estimate - logical :: accurate - real(kind=rk) :: dot11,dot12,dot22 - real(kind=rk) :: top11,top12,top21,top22 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_check") -#endif - - EPSD = EPS - - top11 = seed(1) - dot11 = seed(2) - top12 = seed(3) - dot12 = seed(4) - - top21 = seed(5) - top22 = seed(6) - dot22 = seed(7) - - ! reconstruct the whole inner products - ! (including squares of the top elements) - first = dot11 + top11*top11 - second = dot22 + top22*top22 + top12*top12 - first_second = dot12 + top11*top12 - - ! zero Householder vector (zero norm) case - if (first*second .eq. 0.0d0) then - qr_pdlarfg2_1dcomm_check = .false. -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_check") -#endif - - return - end if - - estimate = abs((first_second*first_second)/(first*second)) - - !print *,'estimate:',estimate - - ! if accurate the following check holds - accurate = (estimate .LE. (epsd/(1.0d0+epsd))) - - qr_pdlarfg2_1dcomm_check = accurate -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_check") -#endif - - end function qr_pdlarfg2_1dcomm_check - - ! id=0: first vector - ! id=1: second vector - subroutine qr_pdlarfg2_1dcomm_vector(x,incx,tau,seed,n,nb,idx,id,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! 
input variables (local) - integer(kind=ik) :: incx - real(kind=rk) :: x(*),seed(*),tau - - ! input variables (global) - integer(kind=ik) :: n,nb,idx,id,rev,mpicomm - - ! output variables (global) - - ! external functions - real(kind=rk), external :: dlapy2 - external :: dscal - - ! local scalars - integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr - real(kind=rk) :: alpha,dot,beta,xnorm - integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_vector") -#endif - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! Processor id for global index of top element - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index(idx,mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - alpha = seed(id*5+1) - dot = seed(id*5+2) - - xnorm = sqrt(dot) - - if (xnorm .eq. 0.0d0) then - ! H = I - - tau = 0.0d0 - else - ! General case - - beta = sign(dlapy2(alpha, xnorm), alpha) - tau = (beta+alpha) / beta - - !print *,'hg2',tau,xnorm,alpha - - call dscal(local_size, 1.0d0/(beta+alpha), & - x(local_offset), incx) - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - if (mpirank .eq. mpirank_top) then - x(top) = -beta - end if - - seed(8) = beta - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_vector") -#endif - - end subroutine qr_pdlarfg2_1dcomm_vector - - subroutine qr_pdlarfg2_1dcomm_update(v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incv,lda - real(kind=rk) :: v(*),a(lda,*),seed(*) - - ! 
input variables (global) - integer(kind=ik) :: n,baseidx,idx,nb,rev,mpicomm - - ! output variables (global) - - ! external functions - external daxpy - - ! local scalars - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: local_size,local_offset,baseoffset - real(kind=rk) :: z,coeff,beta - real(kind=rk) :: dot11,dot12,dot22 - real(kind=rk) :: top11,top12,top21,top22 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_update") -#endif - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - - ! seed should be updated by previous householder generation - ! Update inner product of this column and next column vector - top11 = seed(1) - dot11 = seed(2) - top12 = seed(3) - dot12 = seed(4) - - top21 = seed(5) - top22 = seed(6) - dot22 = seed(7) - beta = seed(8) - - call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - baseoffset = baseoffset * incv - - ! zero Householder vector (zero norm) case - if (beta .eq. 0.0d0) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_update") -#endif - return - end if - z = (dot12 + top11 * top12) / beta + top12 - - !print *,'hg2 update:',baseidx,idx,mpirank,local_size - - call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1) - - ! prepare a full dot22 for update - dot22 = dot22 + top22*top22 - - ! calculate coefficient - COEFF = z / (top11 + beta) - - ! update inner product of next vector - dot22 = dot22 - coeff * (2*dot12 - coeff*dot11) - - ! update dot12 value to represent update with first vector - ! (needed for T matrix) - dot12 = dot12 - COEFF * dot11 - - ! update top element of next vector - top22 = top22 - coeff * top21 - seed(6) = top22 - - ! restore separated dot22 for vector generation - seed(7) = dot22 - top22*top22 - - !------------------------------------------------------ - ! prepare elements for T matrix - seed(4) = dot12 - - ! 
prepare dot matrix for fuse element of T matrix - ! replace top11 value with -beta1 - seed(1) = beta -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_update") -#endif - - end subroutine qr_pdlarfg2_1dcomm_update - - ! run this function after second vector - subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix(seed,tau,t,ldt) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik) :: ldt - real(kind=rk) :: seed(*),t(ldt,*),tau(*) - real(kind=rk) :: dot12,beta1,top21,beta2 -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix") -#endif - - beta1 = seed(1) - dot12 = seed(4) - top21 = seed(5) - beta2 = seed(8) - - !print *,'beta1 beta2',beta1,beta2 - - dot12 = dot12 / beta2 + top21 - dot12 = -(dot12 / beta1) - - t(1,1) = tau(1) - t(1,2) = dot12 - t(2,2) = tau(2) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix") -#endif - - end subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix - - subroutine qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - integer(kind=ik) :: lda,lwork,ldv,ldt - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,k,idx,baseidx,mb,rev,mpicomm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) ::PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - ! output variables (global) - integer(kind=ik) :: actualk - - ! 
local scalars - integer(kind=ik) :: ivector - real(kind=rk) :: pdlarfg_size(1),pdlarf_size(1) - real(kind=rk) :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1) - real(kind=rk) :: pdlarfgk_1dcomm_update_size(1) - integer(kind=ik) :: seedC_size,seedC_offset - integer(kind=ik) :: seedD_size,seedD_offset - integer(kind=ik) :: work_offset -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm") -#endif - - seedC_size = k*k - seedC_offset = 1 - seedD_size = k*k - seedD_offset = seedC_offset + seedC_size - work_offset = seedD_offset + seedD_size - - if (lwork .eq. -1) then - call qr_pdlarfg_1dcomm(a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm) - - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm) - call qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk) - -#else - !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk) -#endif - call qr_pdlarfgk_1dcomm_update(a,lda,baseidx,pdlarfgk_1dcomm_update_size(1),-1,work,work,k,k,1,work,m,mb,rev,mpicomm) - work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), & - pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=rk) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm") -#endif - - return - end if - - call qr_pdlarfgk_1dcomm_seed(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset),work(seedD_offset),m,k,mb,mpicomm) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - !call 
qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk) -#else - !call qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk) - call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk) -#endif - !print *,'possible rank:', actualk - - ! override useful for debugging - !actualk = 1 - !actualk = k - !actualk= min(actualk,2) - do ivector=1,actualk - call qr_pdlarfgk_1dcomm_vector(a(1,k-ivector+1),1,idx,tau(k-ivector+1), & - work(seedC_offset),work(seedD_offset),k, & - ivector,m,mb,rev,mpicomm) - - call qr_pdlarfgk_1dcomm_update(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), & - work(seedD_offset),k,actualk,ivector,tau, & - m,mb,rev,mpicomm) - - call qr_pdlarfg_copy_1dcomm(a(1,k-ivector+1),1, & - v(1,k-ivector+1),1, & - m,baseidx,idx-ivector+1,mb,1,mpicomm) - end do - - ! generate final T matrix and convert preliminary tau values into real ones - call qr_pdlarfgk_1dcomm_generateT(work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm") -#endif - end subroutine qr_pdlarfgk_1dcomm - - subroutine qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*), work(*) - - ! input variables (global) - integer(kind=ik) :: m,k,baseidx,mb,mpicomm - real(kind=rk) :: seedC(k,*),seedD(k,*) - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! 
local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top - integer(kind=ik) :: icol,irow,lidx,remsize - integer(kind=ik) :: remaining_rank - - integer(kind=ik) :: C_size,D_size,sendoffset,recvoffset,sendrecv_size - integer(kind=ik) :: localoffset,localsize,baseoffset -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_seed") -#endif - - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - C_size = k*k - D_size = k*k - sendoffset = 1 - sendrecv_size = C_size+D_size - recvoffset = sendoffset + sendrecv_size - - if (lwork .eq. -1) then - work(1) = DBLE(2*sendrecv_size) - return - end if - - ! clear buffer - work(sendoffset:sendoffset+sendrecv_size-1)=0.0d0 - - ! collect C part - do icol=1,k - - remaining_rank = k - do while (remaining_rank .gt. 0) - irow = k - remaining_rank + 1 - lidx = baseidx - remaining_rank + 1 - - ! determine chunk where the current top element is located - mpirank_top = MOD((lidx-1)/mb,mpiprocs) - - ! limit max number of remaining elements of this chunk to the block - ! distribution parameter - remsize = min(remaining_rank,mb) - - ! determine the number of needed elements in this chunk - call local_size_offset_1d(lidx+remsize-1,mb, & - lidx,lidx,0, & - mpirank_top,mpiprocs, & - localsize,baseoffset,localoffset) - - !print *,'local rank',localsize,localoffset - - if (mpirank .eq. mpirank_top) then - ! copy elements to buffer - work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) & - = a(localoffset:localoffset+remsize-1,icol) - end if - - ! jump to next chunk - remaining_rank = remaining_rank - localsize - end do - end do - - ! 
collect D part - call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, & - mpirank,mpiprocs, & - localsize,baseoffset,localoffset) - - !print *,'localsize',localsize,localoffset - if (localsize > 0) then - call dsyrk("Upper", "Trans", k, localsize, & - 1.0d0, a(localoffset,1), lda, & - 0.0d0, work(sendoffset+C_size), k) - else - work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0d0 - end if - - ! TODO: store symmetric part more efficiently - - ! allreduce operation on results -#ifdef WITH_MPI - call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, & - mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1) -#endif - ! unpack result from buffer into seedC and seedD - seedC(1:k,1:k) = 0.0d0 - do icol=1,k - seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1) - end do - - seedD(1:k,1:k) = 0.0d0 - do icol=1,k - seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1) - end do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_seed") -#endif - - end subroutine qr_pdlarfgk_1dcomm_seed - - ! k is assumed to be larger than two - subroutine qr_pdlarfgk_1dcomm_check_improved(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (global) - integer(kind=ik) :: k,lwork -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) - -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - real(kind=rk) :: seedC(k,*),seedD(k,*),work(k,*) - - ! output variables (global) - integer(kind=ik) :: possiblerank - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local variables - integer(kind=ik) :: i,j,l - real(kind=rk) :: sum_squares,diagonal_square,relative_error,epsd,diagonal_root - real(kind=rk) :: dreverse_matrix_work(1) - - ! 
external functions - real(kind=rk), external :: ddot,dlapy2,dnrm2 - external :: dscal - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_check_improved") -#endif - - if (lwork .eq. -1) then - call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1) - work(1,1) = DBLE(k*k) + dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - eps = PQRPARAM(3) - - if (eps .eq. 0) then - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - epsd = DBLE(eps) - - ! build complete inner product from seedC and seedD - ! copy seedD to work - work(:,1:k) = seedD(:,1:k) - - ! add inner products of seedC to work - call dsyrk("Upper", "Trans", k, k, & - 1.0d0, seedC(1,1), k, & - 1.0d0, work, k) - - ! TODO: optimize this part! - call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - - ! transpose matrix - do i=1,k - do j=i+1,k - work(i,j) = work(j,i) - end do - end do - - - ! do cholesky decomposition - i = 0 - do while ((i .lt. k)) - i = i + 1 - - diagonal_square = abs(work(i,i)) - diagonal_root = sqrt(diagonal_square) - - ! zero Householder vector (zero norm) case - if ((abs(diagonal_square) .eq. 0.0d0) .or. (abs(diagonal_root) .eq. 0.0d0)) then - possiblerank = max(i-1,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - - ! check if relative error is bounded for each Householder vector - ! Householder i is stable iff Househoulder i-1 is "stable" and the accuracy criterion - ! holds. - ! first Householder vector is considered as "stable". - - do j=i+1,k - work(i,j) = work(i,j) / diagonal_root - do l=i+1,j - work(l,j) = work(l,j) - work(i,j) * work(i,l) - end do - end do - !print *,'cholesky step done' - - ! build sum of squares - if (i .eq. 
1) then - sum_squares = 0.0d0 - else - sum_squares = ddot(i-1,work(1,i),1,work(1,i),1) - end if - !relative_error = sum_squares / diagonal_square - !print *,'error ',i,sum_squares,diagonal_square,relative_error - - if (sum_squares .ge. (epsd * diagonal_square)) then - possiblerank = max(i-1,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - return - end if - end do - - possiblerank = i - !print *,'possible rank', possiblerank -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check_improved") -#endif - - end subroutine qr_pdlarfgk_1dcomm_check_improved - - ! TODO: zero Householder vector (zero norm) case - ! - check alpha values as well (from seedC) - subroutine qr_pdlarfgk_1dcomm_check(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank) - use precision - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - - ! input variables (local) - - ! input variables (global) - integer(kind=ik) :: k,lwork -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - integer(kind=ik) :: PQRPARAM(*) -#else - integer(kind=ik) :: PQRPARAM(:) -#endif - real(kind=rk) :: seedC(k,*),seedD(k,*),work(k,*) - - ! output variables (global) - integer(kind=ik) :: possiblerank - - ! derived input variables from QR_PQRPARAM - integer(kind=ik) :: eps - - ! local scalars - integer(kind=ik) :: icol,isqr,iprod - real(kind=rk) :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum - real(kind=rk) :: dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_check") -#endif - if (lwork .eq. -1) then - call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1) - work(1,1) = DBLE(k*k) + dreverse_matrix_work(1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - - return - end if - - eps = PQRPARAM(3) - - if (eps .eq. 
0) then - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - - epsd = DBLE(eps) - - - ! copy seedD to work - work(:,1:k) = seedD(:,1:k) - - ! add inner products of seedC to work - call dsyrk("Upper", "Trans", k, k, & - 1.0d0, seedC(1,1), k, & - 1.0d0, work, k) - - ! TODO: optimize this part! - call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k) - - ! transpose matrix - do icol=1,k - do isqr=icol+1,k - work(icol,isqr) = work(isqr,icol) - end do - end do - - ! work contains now the full inner product of the global (sub-)matrix - do icol=1,k - ! zero Householder vector (zero norm) case - if (abs(work(icol,icol)) .eq. 0.0d0) then - !print *,'too small ', icol, work(icol,icol) - possiblerank = max(icol,1) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - - sum_sqr = 0.0d0 - do isqr=1,icol-1 - sum_products = 0.0d0 - do iprod=1,isqr-1 - sum_products = sum_products + work(iprod,isqr)*work(iprod,icol) - end do - - !print *,'divisor',icol,isqr,work(isqr,isqr) - temp = (work(isqr,icol) - sum_products)/work(isqr,isqr) - work(isqr,icol) = temp - sum_sqr = sum_sqr + temp*temp - end do - - ! calculate diagonal value - diff = work(icol,icol) - sum_sqr - if (diff .lt. 0.0d0) then - ! we definitely have a problem now - possiblerank = icol-1 ! only decompose to previous column (including) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - work(icol,icol) = sqrt(diff) - ! calculate orthogonality - ortho = 0.0d0 - do isqr=1,icol-1 - ortho_sum = 0.0d0 - do iprod=isqr,icol-1 - temp = work(isqr,iprod)*work(isqr,iprod) - !print *,'ortho ', work(iprod,iprod) - temp = temp / (work(iprod,iprod)*work(iprod,iprod)) - ortho_sum = ortho_sum + temp - end do - ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol)) - end do - - ! 
---------------- with division by zero ----------------------- ! - - !ortho = ortho / diff; - - ! if current estimate is not accurate enough, the following check holds - !if (ortho .gt. epsd) then - ! possiblerank = icol-1 ! only decompose to previous column (including) - ! return - !end if - - ! ---------------- without division by zero ----------------------- ! - - ! if current estimate is not accurate enough, the following check holds - if (ortho .gt. epsd * diff) then - possiblerank = icol-1 ! only decompose to previous column (including) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - return - end if - end do - - ! if we get to this point, the accuracy condition holds for the whole block - possiblerank = k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_check") -#endif - end subroutine qr_pdlarfgk_1dcomm_check - - !sidx: seed idx - !k: max rank used during seed phase - !rank: actual rank (k >= rank) - subroutine qr_pdlarfgk_1dcomm_vector(x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incx - real(kind=rk) :: x(*),tau - - ! input variables (global) - integer(kind=ik) :: n,nb,baseidx,rev,mpicomm,k,sidx - real(kind=rk) :: seedC(k,*),seedD(k,*) - - ! output variables (global) - - ! external functions - real(kind=rk), external :: dlapy2,dnrm2 - external :: dscal - - ! 
local scalars - integer(kind=ik) :: mpirank,mpirank_top,mpiprocs,mpierr - real(kind=rk) :: alpha,dot,beta,xnorm - integer(kind=ik) :: local_size,baseoffset,local_offset,top,topidx - integer(kind=ik) :: lidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_vector") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - lidx = baseidx-sidx+1 - call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, & - local_size,baseoffset,local_offset) - - local_offset = local_offset * incx - - ! Processor id for global index of top element - mpirank_top = MOD((lidx-1)/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0) - top = 1+(topidx-1)*incx - end if - - alpha = seedC(k-sidx+1,k-sidx+1) - dot = seedD(k-sidx+1,k-sidx+1) - ! assemble actual norm from both seed parts - xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1)) - - if (xnorm .eq. 0.0d0) then - tau = 0.0d0 - else - ! General case - - beta = sign(dlapy2(alpha, xnorm), alpha) - ! store a preliminary version of beta in tau - tau = beta - - ! update global part - call dscal(local_size, 1.0d0/(beta+alpha), & - x(local_offset), incx) - - ! do not update local part here due to - ! dependency of c vector during update process - - ! TODO: reimplement norm rescale method of - ! original PDLARFG using mpi? - - if (mpirank .eq. mpirank_top) then - x(top) = -beta - end if - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_vector") -#endif - - end subroutine qr_pdlarfgk_1dcomm_vector - - !k: original max rank used during seed function - !rank: possible rank as from check function - ! TODO: if rank is less than k, reduce buffersize in such a way - ! that only the required entries for the next pdlarfg steps are - ! 
computed - subroutine qr_pdlarfgk_1dcomm_update(a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! parameter setup - INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4 - - ! input variables (local) - integer(kind=ik) :: lda,lwork - real(kind=rk) :: a(lda,*),work(*) - - ! input variables (global) - integer(kind=ik) :: k,rank,sidx,n,baseidx,nb,rev,mpicomm - real(kind=rk) :: beta - - ! output variables (global) - real(kind=rk) :: seedC(k,*),seedD(k,*),tau(*) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - real(kind=rk) :: alpha - integer(kind=ik) :: coffset,zoffset,yoffset,voffset,buffersize - integer(kind=ik) :: mpirank,mpierr,mpiprocs,mpirank_top - integer(kind=ik) :: localsize,baseoffset,localoffset,topidx - integer(kind=ik) :: lidx -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_update") -#endif - if (lwork .eq. -1) then - ! buffer for c,z,y,v - work(1) = 4*k -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - - return - end if - - ! nothing to update anymore - if (sidx .gt. rank) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - lidx = baseidx-sidx - if (lidx .lt. 1) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - endif - - call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,localoffset) - - coffset = 1 - zoffset = coffset + k - yoffset = zoffset + k - voffset = yoffset + k - buffersize = k - sidx - - ! finalize tau values - alpha = seedC(k-sidx+1,k-sidx+1) - beta = tau(k-sidx+1) - - ! 
zero Householder vector (zero norm) case - !print *,'k update: alpha,beta',alpha,beta - if ((beta .eq. 0.0d0) .or. (alpha .eq. 0.0d0)) then - tau(k-sidx+1) = 0.0d0 - seedC(k,k-sidx+1) = 0.0d0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - return - end if - - tau(k-sidx+1) = (beta+alpha) / beta - - ! --------------------------------------- - ! calculate c vector (extra vector or encode in seedC/seedD? - work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) - call dgemv("Trans", buffersize+1, buffersize, & - 1.0d0,seedC(1,1),k,seedC(1,k-sidx+1),1, & - 1.0d0,work(coffset),1) - - ! calculate z using tau,seedD,seedC and c vector - work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize) - call daxpy(buffersize, 1.0d0/beta, work(coffset), 1, work(zoffset), 1) - - ! update A1(local copy) and generate part of householder vectors for use - call daxpy(buffersize, -1.0d0, work(zoffset),1,seedC(k-sidx+1,1),k) - call dscal(buffersize, 1.0d0/(alpha+beta), seedC(1,k-sidx+1),1) - call dger(buffersize, buffersize, -1.0d0, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k) - - ! update A global (householder vector already generated by pdlarfgk) - mpirank_top = MOD(lidx/nb,mpiprocs) - if (mpirank .eq. mpirank_top) then - ! handle first row separately - topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0) - call daxpy(buffersize,-1.0d0,work(zoffset),1,a(topidx,1),lda) - end if - - call dger(localsize, buffersize,-1.0d0, & - a(localoffset,k-sidx+1),1,work(zoffset),1, & - a(localoffset,1),lda) - - ! update D (symmetric) => two buffer vectors of size rank - ! generate y vector - work(yoffset:yoffset+buffersize-1) = 0.d0 - call daxpy(buffersize,1.0d0/(alpha+beta),work(zoffset),1,work(yoffset),1) - - ! generate v vector - work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1) - call daxpy(buffersize, -0.5d0*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1) - - ! 
symmetric update of D using y and v - call dsyr2("Upper", buffersize,-1.0d0, & - work(yoffset),1,work(voffset),1, & - seedD(1,1), k) - - ! prepare T matrix inner products - ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y' - ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements - call dger(k-sidx,sidx,-1.0d0,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k) - seedC(k,k-sidx+1) = 1.0d0/(alpha+beta) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_update") -#endif - end subroutine qr_pdlarfgk_1dcomm_update - - subroutine qr_pdlarfgk_1dcomm_generateT(seedC,seedD,k,actualk,tau,t,ldt) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - integer(kind=ik) :: k,actualk,ldt - real(kind=rk) :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*) - - integer(kind=ik) :: irow,icol - real(kind=rk) :: column_coefficient -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfgk_1dcomm_generateT") -#endif - - !print *,'reversed on the fly T generation NYI' - - do icol=1,actualk-1 - ! calculate inner product of householder vector parts in seedC - ! (actually calculating more than necessary, if actualk < k) - ! => a lot of junk from row 1 to row k-actualk - call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1) - - ! 
add scaled D parts to current column of C (will become later T rows) - column_coefficient = seedC(k,k-icol+1) - do irow=k-actualk+1,k-1 - seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) + ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) ) - end do - end do - - call qr_dlarft_kernel(actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfgk_1dcomm_generateT") -#endif - - end subroutine qr_pdlarfgk_1dcomm_generateT - - !direction=0: pack into work buffer - !direction=1: unpack from work buffer - subroutine qr_pdgeqrf_pack_unpack(v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,lwork - real(kind=rk) :: v(ldv,*), work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs - integer(kind=ik) :: buffersize,icol - integer(kind=ik) :: local_size,baseoffset,offset - - ! external functions -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_pack_unpack") -#endif - call mpi_comm_rank(mpicomm,mpirank,mpierr) - call mpi_comm_size(mpicomm,mpiprocs,mpierr) - call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, & - local_size,baseoffset,offset) - - !print *,'pack/unpack',local_size,baseoffset,offset - - ! rough approximate for buffer size - if (lwork .eq. -1) then - buffersize = local_size * n ! vector elements - work(1) = DBLE(buffersize) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack") -#endif - - return - end if - - if (direction .eq. 0) then - ! copy v part to buffer (including zeros) - do icol=1,n - work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol) - end do - else - ! 
copy v part from buffer (including zeros) - do icol=1,n - v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol) - end do - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack") -#endif - - return - - end subroutine qr_pdgeqrf_pack_unpack - - !direction=0: pack into work buffer - !direction=1: unpack from work buffer - subroutine qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,work,lwork,n,direction) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,lwork - real(kind=rk) :: work(*), t(ldt,*),tau(*) - - ! input variables (global) - integer(kind=ik) :: n,direction - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: icol - - ! external functions -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - - - if (lwork .eq. -1) then - work(1) = DBLE(n*n) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - - return - end if - - if (direction .eq. 0) then - ! append t matrix to buffer (including zeros) - do icol=1,n - work(1+(icol-1)*n:icol*n) = t(1:n,icol) - end do - else - ! append t matrix from buffer (including zeros) - do icol=1,n - t(1:n,icol) = work(1+(icol-1)*n:icol*n) - tau(icol) = t(icol,icol) - end do - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix") -#endif - end subroutine qr_pdgeqrf_pack_unpack_tmatrix - - - ! TODO: encode following functionality - ! - Direction? BOTTOM UP or TOP DOWN ("Up", "Down") - ! => influences all related kernels (including DLARFT / DLARFB) - ! - rank-k parameter (k=1,2,...,b) - ! => influences possible update strategies - ! => parameterize the function itself? (FUNCPTR, FUNCARG) - ! - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels) - ! - subblocking - ! 
(maximum block size bounded by data distribution along rows) - ! - blocking method (householder vectors only or compact WY?) - ! - update strategy of trailing parts (incremental, complete) - ! - difference for subblocks and normal blocks? (UPDATE and UPDATESUB) - ! o "Incremental" - ! o "Full" - ! - final T generation (recursive: subblock wise, block wise, end) (TMERGE) - ! ' (implicitly given by / influences update strategies?) - ! => alternative: during update: iterate over sub t parts - ! => advantage: smaller (cache aware T parts) - ! => disadvantage: more memory write backs - ! (number of T parts * matrix elements) - ! - partial/sub T generation (TGEN) - ! o add vectors right after creation (Vector) - ! o add set of vectors (Set) - ! - bcast strategy of householder vectors to other process columns - ! (influences T matrix generation and trailing update - ! in other process columns) - ! o no broadcast (NONE = benchmarking?, - ! or not needed due to 1D process grid) - ! o after every housegen (VECTOR) - ! o after every subblk (SUBBLOCK) - ! o after full local column block decomposition (BLOCK) - ! LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft - - !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, & - ! SUBBLK, UPDATE, TGEN, BCAST) - ! gmode: control communication pattern of dlarfg - ! maxrank: control max number of householder vectors per communication - ! eps: error threshold (integer) - ! update*: control update pattern in pdgeqr2_1dcomm ('incremental','full','merge') - ! merging = full update with tmatrix merging - ! tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge - ! only matters if update* == full - subroutine qr_pqrparam_init(pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode) - use precision -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! 
input - CHARACTER :: update2d,update1d,update,hgmode - INTEGER(kind=ik) :: size2d,size1d,maxrank,eps,tmerge2d,tmerge1d - - ! output -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - INTEGER(kind=ik) :: PQRPARAM(*) -#else - INTEGER(kind=ik) :: PQRPARAM(1:11) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pqrparam_init") -#endif - - PQRPARAM(1) = size2d - PQRPARAM(2) = ichar(update2d) - PQRPARAM(3) = tmerge2d - ! TODO: broadcast T yes/no - - PQRPARAM(4) = size1d - PQRPARAM(5) = ichar(update1d) - PQRPARAM(6) = tmerge1d - - PQRPARAM(7) = maxrank - PQRPARAM(8) = ichar(update) - PQRPARAM(9) = eps - PQRPARAM(10) = ichar(hgmode) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pqrparam_init") -#endif - - end subroutine qr_pqrparam_init - - subroutine qr_pdlarfg_copy_1dcomm(x,incx,v,incv,n,baseidx,idx,nb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - - ! input variables (local) - integer(kind=ik) :: incx,incv - real(kind=rk) :: x(*), v(*) - - ! input variables (global) - integer(kind=ik) :: baseidx,idx,rev,nb,n - integer(kind=ik) :: mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpiprocs - integer(kind=ik) :: mpirank,mpirank_top - integer(kind=ik) :: irow,x_offset - integer(kind=ik) :: v_offset,local_size - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("qr_pdlarfg_copy_1dcomm") -#endif - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,v_offset,x_offset) - v_offset = v_offset * incv - - !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size - - ! copy elements - do irow=1,local_size - v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset) - end do - - ! replace top element to build an unitary vector - mpirank_top = MOD((idx-1)/nb,mpiprocs) - if (mpirank .eq. 
mpirank_top) then - v(local_size*incv) = 1.0d0 - end if -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("qr_pdlarfg_copy_1dcomm") -#endif - - end subroutine qr_pdlarfg_copy_1dcomm - -end module elpa_pdgeqrf diff -Nru elpa-2016.05.001/src/elpa_qr/elpa_pdlarfb.F90 elpa-2019.11.001/src/elpa_qr/elpa_pdlarfb.F90 --- elpa-2016.05.001/src/elpa_qr/elpa_pdlarfb.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_qr/elpa_pdlarfb.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,643 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! 
license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" - -module elpa_pdlarfb - - use elpa1_compute - use qr_utils_mod - use elpa_mpi - implicit none - - PRIVATE - - public :: qr_pdlarfb_1dcomm - public :: qr_pdlarft_pdlarfb_1dcomm - public :: qr_pdlarft_set_merge_1dcomm - public :: qr_pdlarft_tree_merge_1dcomm - public :: qr_pdlarfl_1dcomm - public :: qr_pdlarfl2_tmatrix_1dcomm - public :: qr_tmerge_pdlarfb_1dcomm - -contains - -subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,ldv,ldt,lwork - real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(k,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,baseidx,idx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (idx .le. 1) return - - if (n .le. 0) return ! nothing to do - - if (k .eq. 1) then - call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1), & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - else if (k .eq. 2) then - call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt, & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - end if - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*k*n) - return - end if - - !print *,'updating trailing matrix with k=',k - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - ! use baseidx as idx here, otherwise the upper triangle part will be lost - ! 
during the calculation, especially in the reversed case - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - ! Z' = Y' * A - if (localsize .gt. 0) then - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0,work(1,1),k) - else - work(1:k,1:n) = 0.0d0 - end if - - ! data exchange -#ifdef WITH_MPI - call mpi_allreduce(work(1,1),work(1,n+1),k*n,mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(1:k*n,n+1) = work(1:k*n,1) -#endif - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t,ldt,work(1,n+1),k) -end subroutine qr_pdlarfb_1dcomm - -! generalized pdlarfl2 version -! TODO: include T merge here (seperate by "old" and "new" index) -subroutine qr_pdlarft_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lda,lwork - real(kind=rk) :: v(ldv,*),tau(*),t(ldt,*),work(k,*),a(lda,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - integer(kind=ik) :: icol - - integer(kind=ik) :: sendoffset,recvoffset,sendsize - - sendoffset = 1 - sendsize = k*(k+n+oldk) - recvoffset = sendoffset+(k+n+oldk) - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*(k*k+k*n+oldk)) - return - end if - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 0) then - ! calculate inner product of householdervectors - call dsyrk("Upper","Trans",k,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),k) - - ! 
calculate matrix matrix product of householder vectors and target matrix - ! Z' = Y' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0,work(1,k+1),k) - - ! TODO: reserved for T merge parts - work(1:k,n+k+1:n+k+oldk) = 0.0d0 - else - work(1:k,1:(n+k+oldk)) = 0.0d0 - end if - - ! exchange data -#ifdef WITH_MPI - call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(1:sendsize,recvoffset) = work(1:sendsize,sendoffset) -#endif - ! generate T matrix (pdlarft) - t(1:k,1:k) = 0.0d0 ! DEBUG: clear buffer first - - ! T1 = tau1 - ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | - ! | 0 Tk-1 | - t(k,k) = tau(k) - do icol=k-1,1,-1 - t(icol,icol+1:k) = -tau(icol)*work(icol,recvoffset+icol:recvoffset+k-1) - call dtrmv("Upper","Trans","Nonunit",k-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) - t(icol,icol) = tau(icol) - end do - - ! TODO: elmroth and gustavson - - ! update matrix (pdlarfb) - ! Z' = T * Z' - call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,work(1,recvoffset+k),k) - - ! A = A - Y * V' - call dgemm("Notrans","Notrans",localsize,n,k,-1.0d0,v(baseoffset,1),ldv,work(1,recvoffset+k),k,1.0d0,a(offset,1),lda) - -end subroutine qr_pdlarft_pdlarfb_1dcomm - -subroutine qr_pdlarft_set_merge_1dcomm(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(n,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,blocksize,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (lwork .eq. 
-1) then - work(1,1) = DBLE(2*n*n) - return - end if - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 0) then - call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n) - else - work(1:n,1:n) = 0.0d0 - end if -#ifdef WITH_MPI - call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(1:n,n+1:n+1+n-1) = work(1:n,1:n) -#endif - ! skip Y4'*Y4 part - offset = mod(n,blocksize) - if (offset .eq. 0) offset=blocksize - call qr_tmerge_set_kernel(n,blocksize,t,ldt,work(1,n+1+offset),n) - -end subroutine qr_pdlarft_set_merge_1dcomm - -subroutine qr_pdlarft_tree_merge_1dcomm(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(n,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,blocksize,treeorder,baseidx,rev,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - if (lwork .eq. -1) then - work(1,1) = DBLE(2*n*n) - return - end if - - if (n .le. blocksize) return ! nothing to do - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - if (localsize .gt. 0) then - call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n) - else - work(1:n,1:n) = 0.0d0 - end if -#ifdef WITH_MPI - call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(1:n,n+1:n+1+n-1) = work(1:n,1:n) -#endif - ! 
skip Y4'*Y4 part - offset = mod(n,blocksize) - if (offset .eq. 0) offset=blocksize - call qr_tmerge_tree_kernel(n,blocksize,treeorder,t,ldt,work(1,n+1+offset),n) - -end subroutine qr_pdlarft_tree_merge_1dcomm - -! apply householder vector to the left -! - assume unitary matrix -! - assume right positions for v -subroutine qr_pdlarfl_1dcomm(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: incv,lda,lwork,baseidx - real(kind=rk) :: v(*),a(lda,*),work(*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,rev,idx,mpicomm - real(kind=rk) :: tau - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs - integer(kind=ik) :: sendsize,recvsize,icol - integer(kind=ik) :: local_size,local_offset - integer(kind=ik) :: v_local_offset - - ! external functions - real(kind=rk), external :: ddot - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - sendsize = n - recvsize = sendsize - - if (lwork .eq. -1) then - work(1) = DBLE(sendsize + recvsize) - return - end if - - if (n .le. 0) return - - if (idx .le. 
1) return - - call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size,v_local_offset,local_offset) - - !print *,'hl ref',local_size,n - - v_local_offset = v_local_offset * incv - - if (local_size > 0) then - - do icol=1,n - work(icol) = dot_product(v(v_local_offset:v_local_offset+local_size-1),a(local_offset:local_offset+local_size-1,icol)) - - end do - else - work(1:n) = 0.0d0 - end if -#ifdef WITH_MPI - call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr) -#else - work(sendsize+1:sendsize+1+sendsize+1+sendsize-1) = work(1:sendsize) -#endif - if (local_size > 0) then - - do icol=1,n - a(local_offset:local_offset+local_size-1,icol) = a(local_offset:local_offset+local_size-1,icol) & - - tau*work(sendsize+icol)*v(v_local_offset:v_local_offset+ & - local_size-1) - enddo - end if - -end subroutine qr_pdlarfl_1dcomm - -subroutine qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,idx,mb,rev,mpicomm) - use precision - use ELPA1 - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,lda,lwork,baseidx,ldt - real(kind=rk) :: v(ldv,*),a(lda,*),work(*),t(ldt,*) - - ! input variables (global) - integer(kind=ik) :: m,n,mb,rev,idx,mpicomm - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top1,mpirank_top2 - integer(kind=ik) :: dgemv1_offset,dgemv2_offset - integer(kind=ik) :: sendsize, recvsize - integer(kind=ik) :: local_size1,local_offset1 - integer(kind=ik) :: local_size2,local_offset2 - integer(kind=ik) :: local_size_dger,local_offset_dger - integer(kind=ik) :: v1_local_offset,v2_local_offset - integer(kind=ik) :: v_local_offset_dger - real(kind=rk) :: hvdot - integer(kind=ik) :: irow,icol,v1col,v2col - - ! 
external functions - real(kind=rk), external :: ddot - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - sendsize = 2*n - recvsize = sendsize - - if (lwork .eq. -1) then - work(1) = sendsize + recvsize - return - end if - - dgemv1_offset = 1 - dgemv2_offset = dgemv1_offset + n - - ! in 2x2 matrix case only one householder vector was generated - if (idx .le. 2) then - call qr_pdlarfl_1dcomm(v(1,2),1,baseidx,a,lda,t(2,2), & - work,lwork,m,n,idx,mb,rev,mpicomm) - return - end if - - call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, & - local_size1,v1_local_offset,local_offset1) - call local_size_offset_1d(m,mb,baseidx,idx-1,rev,mpirank,mpiprocs, & - local_size2,v2_local_offset,local_offset2) - - v1_local_offset = v1_local_offset * 1 - v2_local_offset = v2_local_offset * 1 - - v1col = 2 - v2col = 1 - - ! keep buffers clean in case that local_size1/local_size2 are zero - work(1:sendsize) = 0.0d0 - - call dgemv("Trans",local_size1,n,1.0d0,a(local_offset1,1),lda,v(v1_local_offset,v1col),1,0.0d0,work(dgemv1_offset),1) - call dgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda,v(v2_local_offset,v2col),1,0.0d0, & - work(dgemv2_offset),1) -#ifdef WITH_MPI - call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr) -#else - work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize) -#endif - ! update second vector - call daxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1) - - call local_size_offset_1d(m,mb,baseidx,idx-2,rev,mpirank,mpiprocs, & - local_size_dger,v_local_offset_dger,local_offset_dger) - - ! get ranks of processes with topelements - mpirank_top1 = MOD((idx-1)/mb,mpiprocs) - mpirank_top2 = MOD((idx-2)/mb,mpiprocs) - - if (mpirank_top1 .eq. mpirank) local_offset1 = local_size1 - if (mpirank_top2 .eq. mpirank) then - local_offset2 = local_size2 - v2_local_offset = local_size2 - end if - - ! 
use hvdot as temporary variable - hvdot = t(v1col,v1col) - do icol=1,n - ! make use of "1" entries in householder vectors - if (mpirank_top1 .eq. mpirank) then - a(local_offset1,icol) = a(local_offset1,icol) & - - work(sendsize+dgemv1_offset+icol-1)*hvdot - end if - - if (mpirank_top2 .eq. mpirank) then - a(local_offset2,icol) = a(local_offset2,icol) & - - v(v2_local_offset,v1col)*work(sendsize+dgemv1_offset+icol-1)*hvdot & - - work(sendsize+dgemv2_offset+icol-1) - end if - - do irow=1,local_size_dger - a(local_offset_dger+irow-1,icol) = a(local_offset_dger+irow-1,icol) & - - work(sendsize+dgemv1_offset+icol-1)*v(v_local_offset_dger+irow-1,v1col)*hvdot & - - work(sendsize+dgemv2_offset+icol-1)*v(v_local_offset_dger+irow-1,v2col) - end do - end do - -end subroutine qr_pdlarfl2_tmatrix_1dcomm - -! generalized pdlarfl2 version -! TODO: include T merge here (seperate by "old" and "new" index) -subroutine qr_tmerge_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) - use precision - use qr_utils_mod - - implicit none - - ! input variables (local) - integer(kind=ik) :: ldv,ldt,lda,lwork - real(kind=rk) :: v(ldv,*),t(ldt,*),work(*),a(lda,*) - - ! input variables (global) - integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,updatemode,mpicomm - - ! output variables (global) - - ! derived input variables from QR_PQRPARAM - - ! local scalars - integer(kind=ik) :: localsize,offset,baseoffset - integer(kind=ik) :: mpirank,mpiprocs,mpierr - - integer(kind=ik) :: sendoffset,recvoffset,sendsize - integer(kind=ik) :: updateoffset,updatelda,updatesize - integer(kind=ik) :: mergeoffset,mergelda,mergesize - integer(kind=ik) :: tgenoffset,tgenlda,tgensize - - if (updatemode .eq. ichar('I')) then - updatelda = oldk+k - else - updatelda = k - end if - - updatesize = updatelda*n - - mergelda = k - mergesize = mergelda*oldk - - tgenlda = 0 - tgensize = 0 - - sendsize = updatesize + mergesize + tgensize - - if (lwork .eq. 
-1) then - work(1) = DBLE(2*sendsize) - return - end if - call MPI_Comm_rank(mpicomm,mpirank,mpierr) - call MPI_Comm_size(mpicomm,mpiprocs,mpierr) - ! use baseidx as idx here, otherwise the upper triangle part will be lost - ! during the calculation, especially in the reversed case - call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, & - localsize,baseoffset,offset) - - sendoffset = 1 - - if (oldk .gt. 0) then - updateoffset = 0 - mergeoffset = updateoffset + updatesize - tgenoffset = mergeoffset + mergesize - - sendsize = updatesize + mergesize + tgensize - - !print *,'sendsize',sendsize,updatesize,mergesize,tgensize - !print *,'merging nr of rotations', oldk+k - - if (localsize .gt. 0) then - ! calculate matrix matrix product of householder vectors and target matrix - - if (updatemode .eq. ichar('I')) then - ! Z' = (Y1,Y2)' * A - call dgemm("Trans","Notrans",k+oldk,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - else - ! Z' = Y1' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - end if - - ! calculate parts needed for T merge - call dgemm("Trans","Notrans",k,oldk,localsize,1.0d0,v(baseoffset,1),ldv,v(baseoffset,k+1),ldv,0.0d0, & - work(sendoffset+mergeoffset),mergelda) - - else - ! cleanup buffer - work(sendoffset:sendoffset+sendsize-1) = 0.0d0 - end if - else - ! do not calculate parts for T merge as there is nothing to merge - - mergeoffset = 0 - updateoffset = 0 - - tgenoffset = updateoffset + updatesize - - sendsize = updatesize + tgensize - - if (localsize .gt. 0) then - ! calculate matrix matrix product of householder vectors and target matrix - ! Z' = (Y1)' * A - call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, & - work(sendoffset+updateoffset),updatelda) - - else - ! 
cleanup buffer - work(sendoffset:sendoffset+sendsize-1) = 0.0d0 - end if - - end if - - recvoffset = sendoffset + sendsize - - if (sendsize .le. 0) return ! nothing to do - - ! exchange data -#ifdef WITH_MPI - call mpi_allreduce(work(sendoffset),work(recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr) -#else - work(recvoffset:recvoffset+sendsize-1) = work(sendoffset:sendoffset+sendsize-1) -#endif - updateoffset = recvoffset+updateoffset - mergeoffset = recvoffset+mergeoffset - tgenoffset = recvoffset+tgenoffset - - if (oldk .gt. 0) then - call qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,work(mergeoffset),mergelda) - - if (localsize .gt. 0) then - if (updatemode .eq. ichar('I')) then - - ! update matrix (pdlarfb) with complete T - call qr_pdlarfb_kernel_local(localsize,n,k+oldk,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - else - ! update matrix (pdlarfb) with small T (same as update with no old T TODO) - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - end if - end if - else - if (localsize .gt. 0) then - ! update matrix (pdlarfb) with small T - call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, & - work(updateoffset),updatelda) - end if - end if - -end subroutine qr_tmerge_pdlarfb_1dcomm - -end module elpa_pdlarfb diff -Nru elpa-2016.05.001/src/elpa_qr/elpa_qrkernels.f90 elpa-2019.11.001/src/elpa_qr/elpa_qrkernels.f90 --- elpa-2016.05.001/src/elpa_qr/elpa_qrkernels.f90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_qr/elpa_qrkernels.f90 1970-01-01 00:00:00.000000000 +0000 @@ -1,783 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! 
- Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! calculates A = A - Y*T'*Z (rev=0) -! calculates A = A - Y*T*Z (rev=1) -! T upper triangle matrix -! assuming zero entries in matrix in upper kxk block -subroutine qr_pdlarfb_kernel_local(m,n,k,a,lda,v,ldv,t,ldt,z,ldz) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: lda,ldv,ldt,ldz - real(kind=rk) :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*) - - ! input variables (global) - integer(kind=ik) :: m,n,k - - ! 
local variables - real(kind=rk) :: t11 - real(kind=rk) :: t12,t22,sum1,sum2 - real(kind=rk) :: t13,t23,t33,sum3 - real(kind=rk) :: sum4,t44 - real(kind=rk) :: y1,y2,y3,y4 - real(kind=rk) :: a1 - integer(kind=ik) :: icol,irow,v1col,v2col,v3col - - ! reference implementation - if (k .eq. 1) then - t11 = t(1,1) - do icol=1,n - sum1 = z(1,icol) - a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1) - enddo - return - else if (k .eq. 2) then - v1col = 2 - v2col = 1 - t22 = t(1,1) - t12 = t(1,2) - t11 = t(2,2) - - do icol=1,n - sum1 = t11 * z(v1col,icol) - sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol) - - do irow=1,m - a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - end do - end do - else if (k .eq. 3) then - v1col = 3 - v2col = 2 - v3col = 1 - - t33 = t(1,1) - - t23 = t(1,2) - t22 = t(2,2) - - t13 = t(1,3) - t12 = t(2,3) - t11 = t(3,3) - - do icol=1,n - ! misusing variables for fetch of z parts - y1 = z(v1col,icol) - y2 = z(v2col,icol) - y3 = z(v3col,icol) - - sum1 = t11 * y1!+ 0 * y2!+ 0 * y3 - sum2 = t12 * y1 + t22 * y2!+ 0 * y3 - sum3 = t13 * y1 + t23 * y2 + t33 * y3 - - do irow=1,m - a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 - v(irow,v3col) * sum3 - end do - end do - else if (k .eq. 4) then - do icol=1,n - ! misusing variables for fetch of z parts - y1 = z(1,icol) - y2 = z(2,icol) - y3 = z(3,icol) - y4 = z(4,icol) - - ! dtrmv like - starting from main diagonal and working - ! upwards - t11 = t(1,1) - t22 = t(2,2) - t33 = t(3,3) - t44 = t(4,4) - - sum1 = t11 * y1 - sum2 = t22 * y2 - sum3 = t33 * y3 - sum4 = t44 * y4 - - t11 = t(1,2) - t22 = t(2,3) - t33 = t(3,4) - - sum1 = sum1 + t11 * y2 - sum2 = sum2 + t22 * y3 - sum3 = sum3 + t33 * y4 - - t11 = t(1,3) - t22 = t(2,4) - - sum1 = sum1 + t11 * y3 - sum2 = sum2 + t22 * y4 - - t11 = t(1,4) - sum1 = sum1 + t11 * y4 - - ! one column of V is calculated - ! time to calculate A - Y * V - do irow=1,m ! 
TODO: loop unrolling - y1 = v(irow,1) - y2 = v(irow,2) - y3 = v(irow,3) - y4 = v(irow,4) - - a1 = a(irow,icol) - - a1 = a1 - y1*sum1 - a1 = a1 - y2*sum2 - a1 = a1 - y3*sum3 - a1 = a1 - y4*sum4 - - a(irow,icol) = a1 - end do - end do - else - ! reference implementation - ! V' = T * Z' - call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,z,ldz) - ! A = A - Y * V' - call dgemm("Notrans","Notrans",m,n,k,-1.0d0,v,ldv,z,ldz,1.0d0,a,lda) - end if - -end subroutine -subroutine qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,oldk - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: icol,leftk,rightk - - ! local scalars for optimized versions - integer(kind=ik) :: irow - real(kind=rk) :: t11 - real(kind=rk) :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8 - real(kind=rk) :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08 - real(kind=rk) :: final01,final02,final03,final04,final05,final06,final07,final08 - - if (oldk .eq. 0) return ! nothing to be done - - leftk = k - rightk = oldk - - ! optimized implementations: - if (leftk .eq. 1) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! (dtrmv like) - yty1 = yty(1,1) - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix - t11 = -t(1,1) - final01 = t11 * reg01 - t(1,leftk+icol) = final01 - end do - - !print *,'efficient tmerge - leftk=1' - else if (leftk .eq. 2) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! 
(dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix - yty1 = -t(1,1) - yty2 = -t(1,2) - yty3 = -t(2,2) - - final01 = reg02 * yty2 - final02 = reg02 * yty3 - - final01 = final01 + reg01 * yty1 - - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - end do - - !print *,'efficient tmerge - leftk=2' - else if (leftk .eq. 4) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! (dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - yty3 = yty(3,1) - yty4 = yty(4,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - reg03 = yty3 * t11 - reg04 = yty4 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - yty3 = yty(3,irow) - yty4 = yty(4,irow) - - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - reg03 = reg03 + yty3 * t11 - reg04 = reg04 + yty4 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix (start from diagonal and move upwards) - yty1 = -t(1,1) - yty2 = -t(2,2) - yty3 = -t(3,3) - yty4 = -t(4,4) - - ! main diagonal - final01 = reg01 * yty1 - final02 = reg02 * yty2 - final03 = reg03 * yty3 - final04 = reg04 * yty4 - - ! above main diagonal - yty1 = -t(1,2) - yty2 = -t(2,3) - yty3 = -t(3,4) - - final01 = final01 + reg02 * yty1 - final02 = final02 + reg03 * yty2 - final03 = final03 + reg04 * yty3 - - ! above first side diagonal - yty1 = -t(1,3) - yty2 = -t(2,4) - - final01 = final01 + reg03 * yty1 - final02 = final02 + reg04 * yty2 - - ! above second side diagonal - yty1 = -t(1,4) - - final01 = final01 + reg04 * yty1 - - ! 
write back to final matrix - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - t(3,leftk+icol) = final03 - t(4,leftk+icol) = final04 - end do - - !print *,'efficient tmerge - leftk=4' - else if (leftk .eq. 8) then - do icol=1,rightk - ! multiply inner products with right t matrix - ! (dtrmv like) - yty1 = yty(1,1) - yty2 = yty(2,1) - yty3 = yty(3,1) - yty4 = yty(4,1) - yty5 = yty(5,1) - yty6 = yty(6,1) - yty7 = yty(7,1) - yty8 = yty(8,1) - - t11 = t(leftk+1,leftk+icol) - - reg01 = yty1 * t11 - reg02 = yty2 * t11 - reg03 = yty3 * t11 - reg04 = yty4 * t11 - reg05 = yty5 * t11 - reg06 = yty6 * t11 - reg07 = yty7 * t11 - reg08 = yty8 * t11 - - do irow=2,icol - yty1 = yty(1,irow) - yty2 = yty(2,irow) - yty3 = yty(3,irow) - yty4 = yty(4,irow) - yty5 = yty(5,irow) - yty6 = yty(6,irow) - yty7 = yty(7,irow) - yty8 = yty(8,irow) - - t11 = t(leftk+irow,leftk+icol) - - reg01 = reg01 + yty1 * t11 - reg02 = reg02 + yty2 * t11 - reg03 = reg03 + yty3 * t11 - reg04 = reg04 + yty4 * t11 - reg05 = reg05 + yty5 * t11 - reg06 = reg06 + yty6 * t11 - reg07 = reg07 + yty7 * t11 - reg08 = reg08 + yty8 * t11 - end do - - ! multiply intermediate results with left t matrix and store in final t - ! matrix (start from diagonal and move upwards) - yty1 = -t(1,1) - yty2 = -t(2,2) - yty3 = -t(3,3) - yty4 = -t(4,4) - yty5 = -t(5,5) - yty6 = -t(6,6) - yty7 = -t(7,7) - yty8 = -t(8,8) - - ! main diagonal - final01 = reg01 * yty1 - final02 = reg02 * yty2 - final03 = reg03 * yty3 - final04 = reg04 * yty4 - final05 = reg05 * yty5 - final06 = reg06 * yty6 - final07 = reg07 * yty7 - final08 = reg08 * yty8 - - ! above main diagonal - yty1 = -t(1,2) - yty2 = -t(2,3) - yty3 = -t(3,4) - yty4 = -t(4,5) - yty5 = -t(5,6) - yty6 = -t(6,7) - yty7 = -t(7,8) - - final01 = final01 + reg02 * yty1 - final02 = final02 + reg03 * yty2 - final03 = final03 + reg04 * yty3 - final04 = final04 + reg05 * yty4 - final05 = final05 + reg06 * yty5 - final06 = final06 + reg07 * yty6 - final07 = final07 + reg08 * yty7 - - ! 
above first side diagonal - yty1 = -t(1,3) - yty2 = -t(2,4) - yty3 = -t(3,5) - yty4 = -t(4,6) - yty5 = -t(5,7) - yty6 = -t(6,8) - - final01 = final01 + reg03 * yty1 - final02 = final02 + reg04 * yty2 - final03 = final03 + reg05 * yty3 - final04 = final04 + reg06 * yty4 - final05 = final05 + reg07 * yty5 - final06 = final06 + reg08 * yty6 - - !above second side diagonal - - yty1 = -t(1,4) - yty2 = -t(2,5) - yty3 = -t(3,6) - yty4 = -t(4,7) - yty5 = -t(5,8) - - final01 = final01 + reg04 * yty1 - final02 = final02 + reg05 * yty2 - final03 = final03 + reg06 * yty3 - final04 = final04 + reg07 * yty4 - final05 = final05 + reg08 * yty5 - - ! i think you got the idea by now - - yty1 = -t(1,5) - yty2 = -t(2,6) - yty3 = -t(3,7) - yty4 = -t(4,8) - - final01 = final01 + reg05 * yty1 - final02 = final02 + reg06 * yty2 - final03 = final03 + reg07 * yty3 - final04 = final04 + reg08 * yty4 - - ! ..... - - yty1 = -t(1,6) - yty2 = -t(2,7) - yty3 = -t(3,8) - - final01 = final01 + reg06 * yty1 - final02 = final02 + reg07 * yty2 - final03 = final03 + reg08 * yty3 - - ! ..... - - yty1 = -t(1,7) - yty2 = -t(2,8) - - final01 = final01 + reg07 * yty1 - final02 = final02 + reg08 * yty2 - - ! ..... - - yty1 = -t(1,8) - - final01 = final01 + reg08 * yty1 - - ! write back to final matrix - t(1,leftk+icol) = final01 - t(2,leftk+icol) = final02 - t(3,leftk+icol) = final03 - t(4,leftk+icol) = final04 - t(5,leftk+icol) = final05 - t(6,leftk+icol) = final06 - t(7,leftk+icol) = final07 - t(8,leftk+icol) = final08 - end do - - !print *,'efficient tmerge - leftk=8' - else - ! reference implementation - do icol=1,rightk - t(1:leftk,leftk+icol) = yty(1:leftk,icol) - end do - - ! -T1 * Y1'*Y2 - call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0d0,t(1,1),ldt,t(1,leftk+1),ldt) - ! (-T1 * Y1'*Y2) * T2 - call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0d0,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt) - end if - -end subroutine -! yty structure -! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... -! 
0 Y2'*Y3 Y2'*Y4 ... -! 0 0 Y3'*Y4 ... -! 0 0 0 ... -subroutine qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,blocksize - - ! output variables (global) - - ! local scalars - integer(kind=ik) :: nr_blocks,current_block - integer(kind=ik) :: remainder,oldk - integer(kind=ik) :: yty_column,toffset - - if (k .le. blocksize) return ! nothing to merge - - nr_blocks = k / blocksize - remainder = k - nr_blocks*blocksize - - ! work in "negative" direction: - ! start with latest T matrix part and add older ones - toffset = 1 - yty_column = 1 - - if (remainder .gt. 0) then - call qr_pdlarft_merge_kernel_local(blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy) - current_block = 1 - oldk = remainder+blocksize - yty_column = yty_column + blocksize - else - call qr_pdlarft_merge_kernel_local(blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy) - current_block = 2 - oldk = 2*blocksize - yty_column = yty_column + blocksize - end if - - do while (current_block .lt. nr_blocks) - call qr_pdlarft_merge_kernel_local(blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - current_block = current_block + 1 - oldk = oldk + blocksize - yty_column = yty_column + blocksize - end do - -end subroutine -! yty structure -! Y1'*Y2 Y1'*Y3 Y1'*Y4 ... -! 0 Y2'*Y3 Y2'*Y4 ... -! 0 0 Y3'*Y4 ... -! 0 0 0 ... - -subroutine qr_tmerge_tree_kernel(k,blocksize,treeorder,t,ldt,yty,ldy) - use precision - implicit none - - ! input variables (local) - integer(kind=ik) :: ldt,ldy - real(kind=rk) :: t(ldt,*),yty(ldy,*) - - ! input variables (global) - integer(kind=ik) :: k,blocksize,treeorder - - ! output variables (global) - - ! 
local scalars - integer temp_blocksize,nr_sets,current_set,setsize,nr_blocks - integer remainder,max_treeorder,remaining_size - integer toffset,yty_column - integer toffset_start,yty_column_start - integer yty_end,total_remainder,yty_remainder - - if (treeorder .eq. 0) return ! no merging - - if (treeorder .eq. 1) then - call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - return - end if - - nr_blocks = k / blocksize - max_treeorder = min(nr_blocks,treeorder) - - if (max_treeorder .eq. 1) then - call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy) - return - end if - - ! work in "negative" direction: from latest set to oldest set - ! implementation differs from rev=0 version due to issues with - ! calculating the remainder parts - ! compared to the rev=0 version we split remainder parts directly from - ! parts which can be easily merged in a recursive way - - yty_end = (k / blocksize) * blocksize - if (yty_end .eq. k) then - yty_end = yty_end - blocksize - end if - - !print *,'tree',yty_end,k,blocksize - - yty_column_start = 1 - toffset_start = 1 - - ! is there a remainder block? - nr_blocks = k / blocksize - remainder = k - nr_blocks * blocksize - if (remainder .eq. 0) then - !print *,'no initial remainder' - - ! set offsets to the very beginning as there is no remainder part - yty_column_start = 1 - toffset_start = 1 - total_remainder = 0 - remaining_size = k - yty_remainder = 0 - else - !print *,'starting with initial remainder' - ! select submatrix and make remainder block public - yty_column_start = 1 + blocksize - toffset_start = 1 + remainder - total_remainder = remainder - remaining_size = k - remainder - yty_remainder = 1 - end if - - ! from now on it is a clean set of blocks with sizes of multiple of - ! blocksize - - temp_blocksize = blocksize - - !------------------------------- - do while (remaining_size .gt. 0) - nr_blocks = remaining_size / temp_blocksize - max_treeorder = min(nr_blocks,treeorder) - - if (max_treeorder .eq. 
1) then - remainder = 0 - nr_sets = 0 - setsize = 0 - - if (yty_remainder .gt. 0) then - yty_column = yty_remainder - !print *,'final merging with remainder',temp_blocksize,k,remaining_size,yty_column - call qr_tmerge_set_kernel(k,temp_blocksize,t,ldt,yty(1,yty_column),ldy) - else - !print *,'no remainder - no merging needed',temp_blocksize,k,remaining_size - endif - - remaining_size = 0 - - return ! done - else - nr_sets = nr_blocks / max_treeorder - setsize = max_treeorder*temp_blocksize - remainder = remaining_size - nr_sets*setsize - end if - - if (remainder .gt. 0) then - if (remainder .gt. temp_blocksize) then - toffset = toffset_start - yty_column = yty_column_start - - !print *,'set merging', toffset, yty_column,remainder - call qr_tmerge_set_kernel(remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - if (total_remainder .gt. 0) then - ! merge with existing global remainder part - !print *,'single+set merging',yty_remainder,total_remainder,remainder - - call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) - - yty_remainder = yty_remainder + remainder - toffset_start = toffset_start + remainder - - !print *,'single+set merging (new offsets)',yty_remainder,yty_column_start,toffset_start - - yty_column_start = yty_column_start + remainder - else - ! create new remainder part - !print *,'new remainder+set',yty_remainder - yty_remainder = yty_column_start + remainder - temp_blocksize - yty_column_start = yty_column_start + remainder - toffset_start = toffset_start + remainder - !print *,'new remainder+set (new offsets)',yty_remainder,yty_column_start,toffset_start - end if - - else - if (total_remainder .gt. 0) then - ! 
merge with existing global remainder part - !print *,'single merging',yty_remainder,total_remainder,remainder - - call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy) - - yty_remainder = yty_remainder + remainder - toffset_start = toffset_start + remainder - - !print *,'single merging (new offsets)',yty_remainder,yty_column_start,toffset_start - - yty_column_start = yty_column_start + remainder - else - ! create new remainder part - !print *,'new remainder',yty_remainder - yty_remainder = yty_column_start - yty_column_start = yty_column_start + temp_blocksize - toffset_start = toffset_start + remainder - !print *,'new remainder (new offsets)',yty_remainder,yty_column_start,toffset_start - end if - end if - - total_remainder = total_remainder + remainder - remaining_size = remaining_size - remainder - end if - - current_set = 0 - do while (current_set .lt. nr_sets) - toffset = toffset_start + current_set * setsize - yty_column = yty_column_start + current_set * setsize - - !print *,'recursive merging', toffset, yty_column,setsize - - call qr_tmerge_set_kernel(setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy) - - current_set = current_set + 1 - end do - - !print *,'increasing blocksize', temp_blocksize, setsize - yty_column_start = yty_column_start + (setsize - temp_blocksize) - temp_blocksize = setsize - end do -end subroutine -! yty should not contain the inner products vi'*vi -subroutine qr_dlarft_kernel(n,tau,yty,ldy,t,ldt) - use precision - implicit none - - ! input variables - integer(kind=ik) :: n,ldy,ldt - real(kind=rk) :: tau(*),yty(ldy,*) - - ! output variables - real(kind=rk) :: t(ldt,*) - - ! local variables - integer(kind=ik) :: icol - - ! DEBUG: clear buffer first - !t(1:n,1:n) = 0.0d0 - - ! T1 = tau1 - ! | tauk Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' | - ! 
| 0 Tk-1 | - t(n,n) = tau(n) - do icol=n-1,1,-1 - t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1) - call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt) - t(icol,icol) = tau(icol) - end do -end subroutine diff -Nru elpa-2016.05.001/src/elpa_qr/qr_utils.F90 elpa-2019.11.001/src/elpa_qr/qr_utils.F90 --- elpa-2016.05.001/src/elpa_qr/qr_utils.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_qr/qr_utils.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,402 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! 
license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" - -module qr_utils_mod - use elpa_mpi - implicit none - - PRIVATE - - public :: local_size_offset_1d - public :: reverse_vector_local - public :: reverse_matrix_local - public :: reverse_matrix_1dcomm - public :: reverse_matrix_2dcomm_ref - -contains - -! rev parameter is critical, even in rev only mode! -! pdgeqrf_2dcomm uses rev=0 version to determine the process columns -! involved in the qr decomposition -subroutine local_size_offset_1d(n,nb,baseidx,idx,rev,rank,nprocs, & - lsize,baseoffset,offset) - - use precision - use ELPA1_compute - - implicit none - - ! input - integer(kind=ik) :: n,nb,baseidx,idx,rev,rank,nprocs - - ! output - integer(kind=ik) :: lsize,baseoffset,offset - - ! local scalars - integer(kind=ik) :: rank_idx - - rank_idx = MOD((idx-1)/nb,nprocs) - - ! calculate local size and offsets - if (rev .eq. 1) then - if (idx > 0) then - lsize = local_index(idx,rank,nprocs,nb,-1) - else - lsize = 0 - end if - - baseoffset = 1 - offset = 1 - else - offset = local_index(idx,rank,nprocs,nb,1) - baseoffset = local_index(baseidx,rank,nprocs,nb,1) - - lsize = local_index(n,rank,nprocs,nb,-1) - !print *,'baseidx,idx',baseidx,idx,lsize,n - - lsize = lsize - offset + 1 - - baseoffset = offset - baseoffset + 1 - end if - -end subroutine local_size_offset_1d - - -subroutine reverse_vector_local(n,x,incx,work,lwork) - use precision - implicit none - - ! input - integer(kind=ik) :: incx,n,lwork - real(kind=rk) :: x(*),work(*) - - ! local scalars - real(kind=rk) :: temp - integer(kind=ik) :: srcoffset,destoffset,ientry - - if (lwork .eq. 
-1) then - work(1) = 0.0d0 - return - end if - - do ientry=1,n/2 - srcoffset=1+(ientry-1)*incx - destoffset=1+(n-ientry)*incx - - temp = x(srcoffset) - x(srcoffset) = x(destoffset) - x(destoffset) = temp - end do - -end subroutine reverse_vector_local - -subroutine reverse_matrix_local(trans,m,n,a,lda,work,lwork) - use precision - implicit none - - ! input - integer(kind=ik) :: lda,m,n,lwork,trans - real(kind=rk) :: a(lda,*),work(*) - - ! local scalars - real(kind=rk) :: temp, dworksize(1) - integer(kind=ik) :: incx - integer(kind=ik) :: dimsize - integer(kind=ik) :: i - - if (trans .eq. 1) then - incx = lda - dimsize = n - else - incx = 1 - dimsize = m - end if - - if (lwork .eq. -1) then - call reverse_vector_local(dimsize,a,incx,dworksize,-1) - work(1) = dworksize(1) - return - end if - - if (trans .eq. 1) then - do i=1,m - call reverse_vector_local(dimsize,a(i,1),incx,work,lwork) - end do - else - do i=1,n - call reverse_vector_local(dimsize,a(1,i),incx,work,lwork) - end do - end if - -end subroutine reverse_matrix_local - -subroutine reverse_matrix_2dcomm_ref(m,n,mb,nb,a,lda,work,lwork,mpicomm_cols,mpicomm_rows) - use precision - implicit none - - ! input - integer(kind=ik) :: m,n,lda,lwork,mpicomm_cols,mpicomm_rows,mb,nb - real(kind=rk) :: a(lda,*),work(*) - - ! local scalars - real(kind=rk) :: reverse_column_size(1) - real(kind=rk) :: reverse_row_size(1) - - integer(kind=ik) :: mpirank_cols,mpirank_rows - integer(kind=ik) :: mpiprocs_cols,mpiprocs_rows - integer(kind=ik) :: mpierr - integer(kind=ik) :: lrows,lcols,offset,baseoffset - call MPI_Comm_rank(mpicomm_cols,mpirank_cols,mpierr) - call MPI_Comm_rank(mpicomm_rows,mpirank_rows,mpierr) - call MPI_Comm_size(mpicomm_cols,mpiprocs_cols,mpierr) - call MPI_Comm_size(mpicomm_rows,mpiprocs_rows,mpierr) - call local_size_offset_1d(m,mb,1,1,0,mpirank_cols,mpiprocs_cols, & - lrows,baseoffset,offset) - - call local_size_offset_1d(n,nb,1,1,0,mpirank_rows,mpiprocs_rows, & - lcols,baseoffset,offset) - - if (lwork .eq. 
-1) then - call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,reverse_column_size,-1,mpicomm_cols) - call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,reverse_row_size,-1,mpicomm_rows) - work(1) = max(reverse_column_size(1),reverse_row_size(1)) - return - end if - - call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,work,lwork,mpicomm_cols) - call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,work,lwork,mpicomm_rows) -end subroutine reverse_matrix_2dcomm_ref - -! b: if trans = 'N': b is size of block distribution between rows -! b: if trans = 'T': b is size of block distribution between columns -subroutine reverse_matrix_1dcomm(trans,m,n,b,a,lda,work,lwork,mpicomm) - use precision - use elpa_mpi - - implicit none - - ! input - integer(kind=ik) :: trans - integer(kind=ik) :: m,n,b,lda,lwork,mpicomm - real(kind=rk) :: a(lda,*),work(*) - - ! local scalars - integer(kind=ik) :: mpirank,mpiprocs,mpierr -#ifdef WITH_MPI - integer(kind=ik) :: mpistatus(MPI_STATUS_SIZE) -#endif - integer(kind=ik) :: nr_blocks,dest_process,src_process,step - integer(kind=ik) :: lsize,baseoffset,offset - integer(kind=ik) :: current_index,destblk,srcblk,icol,next_index - integer(kind=ik) :: sendcount,recvcount - integer(kind=ik) :: sendoffset,recvoffset - integer(kind=ik) :: newmatrix_offset,work_offset - integer(kind=ik) :: lcols,lrows,lroffset,lcoffset,dimsize,fixedsize - real(kind=rk) :: dworksize(1) - call MPI_Comm_rank(mpicomm, mpirank, mpierr) - call MPI_Comm_size(mpicomm, mpiprocs, mpierr) - if (trans .eq. 1) then - call local_size_offset_1d(n,b,1,1,0,mpirank,mpiprocs, & - lcols,baseoffset,lcoffset) - lrows = m - else - call local_size_offset_1d(m,b,1,1,0,mpirank,mpiprocs, & - lrows,baseoffset,lroffset) - lcols = n - end if - - if (lwork .eq. 
-1) then - call reverse_matrix_local(trans,lrows,lcols,a,max(lrows,lcols),dworksize,-1) - work(1) = DBLE(3*lrows*lcols) + dworksize(1) - return - end if - - sendoffset = 1 - recvoffset = sendoffset + lrows*lcols - newmatrix_offset = recvoffset + lrows*lcols - work_offset = newmatrix_offset + lrows*lcols - - if (trans .eq. 1) then - dimsize = n - fixedsize = m - else - dimsize = m - fixedsize = n - end if - - if (dimsize .le. 1) then - return ! nothing to do - end if - - ! 1. adjust step size to remainder size - nr_blocks = dimsize / b - nr_blocks = nr_blocks * b - step = dimsize - nr_blocks - if (step .eq. 0) step = b - - ! 2. iterate over destination blocks starting with process 0 - current_index = 1 - do while (current_index .le. dimsize) - destblk = (current_index-1) / b - dest_process = mod(destblk,mpiprocs) - srcblk = (dimsize-current_index) / b - src_process = mod(srcblk,mpiprocs) - - next_index = current_index+step - - ! block for dest_process is located on mpirank if lsize > 0 - call local_size_offset_1d(dimsize-current_index+1,b,dimsize-next_index+2,dimsize-next_index+2,0, & - src_process,mpiprocs,lsize,baseoffset,offset) - - sendcount = lsize*fixedsize - recvcount = sendcount - - ! TODO: this send/recv stuff seems to blow up on BlueGene/P - ! TODO: is there actually room for the requested matrix part? the target - ! process might not have any parts at all (thus no room) - if ((src_process .eq. mpirank) .and. (dest_process .eq. src_process)) then - ! 5. pack data - if (trans .eq. 1) then - do icol=offset,offset+lsize-1 - work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & - a(1:lrows,icol) - end do - else - do icol=1,lcols - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & - a(offset:offset+lsize-1,icol) - end do - end if - - ! 7. reverse data - if (trans .eq. 
1) then - call reverse_matrix_local(1,lrows,lsize,work(sendoffset),lrows,work(work_offset),lwork) - else - call reverse_matrix_local(0,lsize,lcols,work(sendoffset),lsize,work(work_offset),lwork) - end if - - ! 8. store in temp matrix - if (trans .eq. 1) then - do icol=1,lsize - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & - work(sendoffset+(icol-1)*lrows:sendoffset+icol*lrows-1) - end do - - newmatrix_offset = newmatrix_offset + lsize*lrows - else - do icol=1,lcols - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) - end do - - newmatrix_offset = newmatrix_offset + lsize - end if - else - - if (dest_process .eq. mpirank) then - ! 6b. call MPI_Recv -#ifdef WITH_MPI - call MPI_Recv(work(recvoffset), recvcount, mpi_real8, & - src_process, current_index, mpicomm, mpistatus, mpierr) -#else - work(recvoffset:recvoffset+recvcount-1) = work(sendoffset:sendoffset+sendcount-1) -#endif - ! 7. reverse data - if (trans .eq. 1) then - call reverse_matrix_local(1,lrows,lsize,work(recvoffset),lrows,work(work_offset),lwork) - else - call reverse_matrix_local(0,lsize,lcols,work(recvoffset),lsize,work(work_offset),lwork) - end if - - ! 8. store in temp matrix - if (trans .eq. 1) then - do icol=1,lsize - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = & - work(recvoffset+(icol-1)*lrows:recvoffset+icol*lrows-1) - end do - - newmatrix_offset = newmatrix_offset + lsize*lrows - else - do icol=1,lcols - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = & - work(recvoffset+(icol-1)*lsize:recvoffset+icol*lsize-1) - end do - - newmatrix_offset = newmatrix_offset + lsize - end if - end if - - if (src_process .eq. mpirank) then - ! 5. pack data - if (trans .eq. 
1) then - do icol=offset,offset+lsize-1 - work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = & - a(1:lrows,icol) - end do - else - do icol=1,lcols - work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = & - a(offset:offset+lsize-1,icol) - end do - end if - - ! 6a. call MPI_Send -#ifdef WITH_MPI - call MPI_Send(work(sendoffset), sendcount, mpi_real8, & - dest_process, current_index, mpicomm, mpierr) -#endif - end if - end if - - current_index = next_index - end do - - ! 9. copy temp matrix to real matrix - newmatrix_offset = recvoffset + lrows*lcols - do icol=1,lcols - a(1:lrows,icol) = & - work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) - end do -end subroutine reverse_matrix_1dcomm -end module diff -Nru elpa-2016.05.001/src/elpa_reduce_add_vectors.X90 elpa-2019.11.001/src/elpa_reduce_add_vectors.X90 --- elpa-2016.05.001/src/elpa_reduce_add_vectors.X90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_reduce_add_vectors.X90 1970-01-01 00:00:00.000000000 +0000 @@ -1,188 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! 
GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Andreas Marek, MPCDF -#endif - -#if REALCASE==1 -subroutine elpa_reduce_add_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk) -#endif -#if COMPLEXCASE==1 -subroutine elpa_reduce_add_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk) -#endif - -!------------------------------------------------------------------------------- -! This routine does a reduce of all vectors in vmat_s over the communicator comm_t. -! The result of the reduce is gathered on the processors owning the diagonal -! and added to the array of vectors vmat_t (which is distributed over comm_t). -! -! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s -! in the different members within vmat_t (else a reduce wouldn't be necessary). -! After this routine, an allreduce of vmat_t has to be done. -! -! vmat_s array of vectors to be reduced and added -! ld_s leading dimension of vmat_s -! comm_s communicator over which vmat_s is distributed -! vmat_t array of vectors to which vmat_s is added -! ld_t leading dimension of vmat_t -! comm_t communicator over which vmat_t is distributed -! 
nvr global length of vmat_s/vmat_t -! nvc number of columns in vmat_s/vmat_t -! nblk block size of block cyclic distribution -! -!------------------------------------------------------------------------------- - - use precision -! use ELPA1 ! for least_common_multiple -#ifdef WITH_OPENMP - use omp_lib -#endif - use elpa_mpi - - implicit none - - - integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk - DATATYPE, intent(in) :: vmat_s(ld_s,nvc) - DATATYPE, intent(inout) :: vmat_t(ld_t,nvc) - - DATATYPE, allocatable :: aux1(:), aux2(:) - integer(kind=ik) :: myps, mypt, nps, npt - integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr - integer(kind=ik) :: lcm_s_t, nblks_tot - integer(kind=ik) :: auxstride, tylerk, error_unit - - call mpi_comm_rank(comm_s,myps,mpierr) - call mpi_comm_size(comm_s,nps ,mpierr) - call mpi_comm_rank(comm_t,mypt,mpierr) - call mpi_comm_size(comm_t,npt ,mpierr) - - ! Look to elpa_transpose_vectors for the basic idea! - - ! The communictation pattern repeats in the global matrix after - ! the least common multiple of (nps,npt) blocks - - lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt - - nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr - - allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) - allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) - aux1(:) = 0 - aux2(:) = 0 -#ifdef WITH_OPENMP - !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) -#endif - do n = 0, lcm_s_t-1 - - ips = mod(n,nps) - ipt = mod(n,npt) - - auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t) - - if(myps == ips) then - -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = n, nblks_tot-1, lcm_s_t - k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/nps)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) -! 
k = k+nblk - enddo - enddo - - k = nvc * auxstride -#ifdef WITH_OPENMP - !$omp barrier - !$omp master -#endif -#ifdef WITH_MPI - -#if REALCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr) -#endif - -#if COMPLEXCASE==1 - if(k>0) call mpi_reduce(aux1,aux2,k,MPI_DOUBLE_COMPLEX,MPI_SUM,ipt,comm_t,mpierr) -#endif - -#else /* WITH_MPI */ - if(k>0) aux2 = aux1 -#endif /* WITH_MPI */ -#ifdef WITH_OPENMP - !$omp end master - !$omp barrier -#endif - if (mypt == ipt) then -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = n, nblks_tot-1, lcm_s_t - k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/npt)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl) -! k = k+nblk - enddo - enddo - endif - - endif - - enddo -#ifdef WITH_OPENMP - !$omp end parallel -#endif - - deallocate(aux1) - deallocate(aux2) - -end subroutine - - diff -Nru elpa-2016.05.001/src/elpa_transpose_vectors.X90 elpa-2019.11.001/src/elpa_transpose_vectors.X90 --- elpa-2016.05.001/src/elpa_transpose_vectors.X90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/elpa_transpose_vectors.X90 1970-01-01 00:00:00.000000000 +0000 @@ -1,195 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! 
http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! Author: Andreas Marek, MPCDF -#endif - -#if REALCASE==1 -subroutine elpa_transpose_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk) -#endif -#if COMPLEXCASE==1 -subroutine elpa_transpose_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk) -#endif - -!------------------------------------------------------------------------------- -! This routine transposes an array of vectors which are distributed in -! communicator comm_s into its transposed form distributed in communicator comm_t. -! There must be an identical copy of vmat_s in every communicator comm_s. -! After this routine, there is an identical copy of vmat_t in every communicator comm_t. -! -! 
vmat_s original array of vectors -! ld_s leading dimension of vmat_s -! comm_s communicator over which vmat_s is distributed -! vmat_t array of vectors in transposed form -! ld_t leading dimension of vmat_t -! comm_t communicator over which vmat_t is distributed -! nvs global index where to start in vmat_s/vmat_t -! Please note: this is kind of a hint, some values before nvs will be -! accessed in vmat_s/put into vmat_t -! nvr global length of vmat_s/vmat_t -! nvc number of columns in vmat_s/vmat_t -! nblk block size of block cyclic distribution -! -!------------------------------------------------------------------------------- - use precision - -! use ELPA1 ! for least_common_multiple -#ifdef WITH_OPENMP - use omp_lib -#endif - - use elpa_mpi - - implicit none - - - - integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk - DATATYPE, intent(in) :: vmat_s(ld_s,nvc) - DATATYPE, intent(inout) :: vmat_t(ld_t,nvc) - - DATATYPE, allocatable :: aux(:) - integer(kind=ik) :: myps, mypt, nps, npt - integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr - integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip - integer(kind=ik) :: auxstride - - call mpi_comm_rank(comm_s,myps,mpierr) - call mpi_comm_size(comm_s,nps ,mpierr) - call mpi_comm_rank(comm_t,mypt,mpierr) - call mpi_comm_size(comm_t,npt ,mpierr) - - ! The basic idea of this routine is that for every block (in the block cyclic - ! distribution), the processor within comm_t which owns the diagonal - ! broadcasts its values of vmat_s to all processors within comm_t. - ! Of course this has not to be done for every block separately, since - ! the communictation pattern repeats in the global matrix after - ! the least common multiple of (nps,npt) blocks - - lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt - - nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr - - ! Get the number of blocks to be skipped at the begin. - ! 
This must be a multiple of lcm_s_t (else it is getting complicated), - ! thus some elements before nvs will be accessed/set. - - nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t - - allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc )) -#ifdef WITH_OPENMP - !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) -#endif - do n = 0, lcm_s_t-1 - - ips = mod(n,nps) - ipt = mod(n,npt) - - if(mypt == ipt) then - - nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t - auxstride = nblk * nblks_comm -! if(nblks_comm==0) cycle - if (nblks_comm .ne. 0) then - if(myps == ips) then -! k = 0 -#ifdef WITH_OPENMP - !$omp do -#endif - do lc=1,nvc - do i = nblks_skip+n, nblks_tot-1, lcm_s_t - k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/nps)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) -! k = k+nblk - enddo - enddo - endif - -#ifdef WITH_OPENMP - !$omp barrier - !$omp master -#endif -#ifdef WITH_MPI - -#if COMPLEXCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_DOUBLE_COMPLEX,ips,comm_s,mpierr) -#endif - -#if REALCASE==1 - call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_REAL8,ips,comm_s,mpierr) -#endif - -#endif /* WITH_MPI */ - -#ifdef WITH_OPENMP - !$omp end master - !$omp barrier - - !$omp do -#endif -! k = 0 - do lc=1,nvc - do i = nblks_skip+n, nblks_tot-1, lcm_s_t - k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride - ns = (i/npt)*nblk ! local start of block i - nl = min(nvr-i*nblk,nblk) ! length - vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl) -! 
k = k+nblk - enddo - enddo - endif - endif - - enddo -#ifdef WITH_OPENMP - !$omp end parallel -#endif - deallocate(aux) - -end subroutine - diff -Nru elpa-2016.05.001/src/elpa_utilities.F90 elpa-2019.11.001/src/elpa_utilities.F90 --- elpa-2016.05.001/src/elpa_utilities.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/src/elpa_utilities.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,126 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! 
may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -! Author: Andreas Marek, MPCDF - -#include "config-f90.h" - -module ELPA_utilities - -#ifdef HAVE_ISO_FORTRAN_ENV - use iso_fortran_env, only : error_unit -#endif - use precision - implicit none - - private ! By default, all routines contained are private - - public :: debug_messages_via_environment_variable, pcol, prow, error_unit -#ifndef HAVE_ISO_FORTRAN_ENV - integer(kind=ik), parameter :: error_unit = 0 -#endif - - - !****** - contains - - function debug_messages_via_environment_variable() result(isSet) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - logical :: isSet - CHARACTER(len=255) :: ELPA_DEBUG_MESSAGES - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("debug_messages_via_environment_variable") -#endif - - isSet = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("ELPA_DEBUG_MESSAGES",ELPA_DEBUG_MESSAGES) -#endif - if (trim(ELPA_DEBUG_MESSAGES) .eq. "yes") then - isSet = .true. - endif - if (trim(ELPA_DEBUG_MESSAGES) .eq. "no") then - isSet = .true. 
- endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("debug_messages_via_environment_variable") -#endif - - end function debug_messages_via_environment_variable - -!------------------------------------------------------------------------------- - - !Processor col for global col number - pure function pcol(i, nblk, np_cols) result(col) - use precision - implicit none - integer(kind=ik), intent(in) :: i, nblk, np_cols - integer(kind=ik) :: col - col = MOD((i-1)/nblk,np_cols) - end function - -!------------------------------------------------------------------------------- - - !Processor row for global row number - pure function prow(i, nblk, np_rows) result(row) - use precision - implicit none - integer(kind=ik), intent(in) :: i, nblk, np_rows - integer(kind=ik) :: row - row = MOD((i-1)/nblk,np_rows) - end function - -!------------------------------------------------------------------------------- - -end module ELPA_utilities diff -Nru elpa-2016.05.001/src/fortran_constants.h elpa-2019.11.001/src/fortran_constants.h --- elpa-2016.05.001/src/fortran_constants.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/fortran_constants.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,30 @@ +#include + +#define FORTRAN_CONSTANT(name, value, ...) \ + integer(kind=C_INT), parameter :: name = value !ELPA_C_DEFINE + +! General constants + ELPA_FOR_ALL_ERRORS(FORTRAN_CONSTANT) + + +! Solver constants + ELPA_FOR_ALL_SOLVERS(FORTRAN_CONSTANT) +#undef ELPA_NUMBER_OF_SOLVERS + FORTRAN_CONSTANT(ELPA_NUMBER_OF_SOLVERS, (0 ELPA_FOR_ALL_SOLVERS(ELPA_ENUM_SUM))) + + +! Real kernels + ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(FORTRAN_CONSTANT) +#undef ELPA_2STAGE_NUMBER_OF_REAL_KERNELS + FORTRAN_CONSTANT(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS, & NEWLINE (0 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(ELPA_ENUM_SUM))) + + +! 
Complex kernels + ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(FORTRAN_CONSTANT) +#undef ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS + FORTRAN_CONSTANT(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS, & NEWLINE (0 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(ELPA_ENUM_SUM))) + + +! Autotune + ELPA_FOR_ALL_AUTOTUNE_LEVELS(FORTRAN_CONSTANT) + ELPA_FOR_ALL_AUTOTUNE_DOMAINS(FORTRAN_CONSTANT) diff -Nru elpa-2016.05.001/src/ftimings/ftimings.F90 elpa-2019.11.001/src/ftimings/ftimings.F90 --- elpa-2016.05.001/src/ftimings/ftimings.F90 2016-02-26 14:11:56.000000000 +0000 +++ elpa-2019.11.001/src/ftimings/ftimings.F90 2019-12-19 09:47:42.000000000 +0000 @@ -336,14 +336,14 @@ !> \param bytes_per_ldst For calculating the AI, assume this number !> of bytes per load or store (default: 8) subroutine timer_set_print_options(self, & - print_allocated_memory, & - print_virtual_memory, & - print_max_allocated_memory, & - print_flop_count, & - print_flop_rate, & - print_ldst, & + print_allocated_memory, & + print_virtual_memory, & + print_max_allocated_memory, & + print_flop_count, & + print_flop_rate, & + print_ldst, & print_memory_bandwidth, & - print_ai, & + print_ai, & bytes_per_ldst) class(timer_t), intent(inout) :: self logical, intent(in), optional :: & diff -Nru elpa-2016.05.001/src/ftimings/papi.c elpa-2019.11.001/src/ftimings/papi.c --- elpa-2016.05.001/src/ftimings/papi.c 2016-02-26 14:11:56.000000000 +0000 +++ elpa-2019.11.001/src/ftimings/papi.c 2019-12-19 09:47:42.000000000 +0000 @@ -76,6 +76,8 @@ flops_available = 1; } + ldst_available = 0; +#if 0 /* Loads + Stores */ if ((ret = PAPI_query_event(PAPI_LD_INS)) < 0) { fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n", @@ -96,7 +98,7 @@ } else { ldst_available = 1; } - +#endif /* Start */ if ((ret = PAPI_start(event_set)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n", diff -Nru elpa-2016.05.001/src/general/elpa_ssmv_template.F90 elpa-2019.11.001/src/general/elpa_ssmv_template.F90 --- 
elpa-2016.05.001/src/general/elpa_ssmv_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/elpa_ssmv_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,87 @@ +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION +subroutine elpa_dssmv(n, alpha, a, lda, x, y) +#endif +#ifdef SINGLE_PRECISION +subroutine elpa_sssmv(n, alpha, a, lda, x, y) +#endif +#endif /* REALCASE */ +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION +subroutine elpa_zssmv(n, alpha, a, lda, x, y) +#endif +#ifdef SINGLE_PRECISION +subroutine elpa_cssmv(n, alpha, a, lda, x, y) +#endif +#endif /* COMPLEXCASE */ + + use precision + use elpa_utilities, only : error_unit + use elpa_blas_interfaces + implicit none +#include "./precision_kinds.F90" + + integer(kind=BLAS_KIND) :: n, lda + MATH_DATATYPE(kind=rck) :: alpha + MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * ) + integer(kind=ik), parameter :: nb = 64 + integer(kind=ik) :: ii, jj, ic, iy, jc, jx, info + MATH_DATATYPE(kind=rck) :: temp + MATH_DATATYPE(kind=rck) :: work( nb ) + + ! Test the input parameters. + info = 0 + if (n == 0) then + return + end if + if ( n < 0 ) then + info = 1 + else if ( lda < max( 1,n ) ) then + info = 4 + end if + if ( info /= 0 ) then + write(error_unit,*) "wrong arguments in elpa_ssmv, info =", info + return + end if + + ! Access only lower triangular part of a + + temp = zero + do jj = 1, n, nb + jc = min( nb, n-jj+1 ) + jx = 1 + (jj-1) + do ii = 1, n, nb + ic = min( nb, n-ii+1 ) + iy = 1 + (ii-1) + + ! gemv for non-diagonal blocks. 
use 2x dtrmv for diagonal blocks + if ( ii < jj ) then + call PRECISION_GEMV('t', int(jc,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -alpha, & + a( jj, ii ), int(lda, kind=BLAS_KIND), & + x( jx ), 1_BLAS_KIND, temp, y( iy ), 1_BLAS_KIND ) + else if ( ii > jj ) then + call PRECISION_GEMV('n', int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), alpha, a( ii, jj ), & + int(lda,kind=BLAS_KIND), & + x( jx ), 1_BLAS_KIND, temp, y( iy ), 1_BLAS_KIND ) + else + if (temp == zero) then + y(1:n) = zero + else if (temp /= one) then + ! should not happen + call PRECISION_SCAL( int(jc,kind=BLAS_KIND), temp, y( iy ), 1_BLAS_KIND) + end if + call PRECISION_COPY( int(jc,kind=BLAS_KIND), x( jx ), 1_BLAS_KIND, work, 1_BLAS_KIND ) + call PRECISION_TRMV( 'l', 'n', 'n', int(jc,kind=BLAS_KIND), a( jj, jj ), int(lda,kind=BLAS_KIND), work, 1_BLAS_KIND ) + call PRECISION_AXPY( int(jc,kind=BLAS_KIND),alpha, work, 1_BLAS_KIND, y( iy ), 1_BLAS_KIND) + + call PRECISION_COPY( int(jc,kind=BLAS_KIND), x( jx ), 1_BLAS_KIND, work, 1_BLAS_KIND ) + call PRECISION_TRMV( 'l', 't', 'n', int(jc,kind=BLAS_KIND), a( jj, jj ), int(lda,kind=BLAS_KIND), work, 1_BLAS_KIND ) + call PRECISION_AXPY(int(jc,kind=BLAS_KIND), -alpha, work, 1_BLAS_KIND, y( iy ), 1_BLAS_KIND) + end if + end do + temp = one + end do + + return +end subroutine + diff -Nru elpa-2016.05.001/src/general/elpa_ssr2_template.F90 elpa-2019.11.001/src/general/elpa_ssr2_template.F90 --- elpa-2016.05.001/src/general/elpa_ssr2_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/elpa_ssr2_template.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,79 @@ +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION +subroutine elpa_dssr2(n, x, y, a, lda ) +#endif +#ifdef SINGLE_PRECISION +subroutine elpa_sssr2(n, x, y, a, lda ) +#endif +#endif /* REALCASE */ +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION +subroutine elpa_zssr2(n, x, y, a, lda ) +#endif +#ifdef SINGLE_PRECISION +subroutine elpa_cssr2(n, x, y, a, lda ) +#endif +#endif /* 
COMPLEXCASE */ + + use precision + use elpa_utilities, only : error_unit + use elpa_blas_interfaces + implicit none +#include "./precision_kinds.F90" + + integer(kind=BLAS_KIND) :: n, lda + MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * ) + integer(kind=ik), parameter :: nb = 64 + MATH_DATATYPE(kind=rck) :: temp1, temp2 + integer(kind=ik) :: i, j, ii, jj, ic, ix, iy, jc, jx, jy, info + logical :: upper + + ! test the input parameters. + info = 0 + if (n == 0) then + return + end if + if ( n < 0 ) then + info = 1 + else if ( lda < max( 1,n ) ) then + info = 5 + end if + if ( info /= 0 ) then + write(error_unit,*) "wrong arguments in elpa_ssmv, info =", info + return + end if + + ! Access A in lower triangular part. + do jj = 1, n, nb + jc = min( nb, n-jj+1 ) + jx = 1 + (jj-1) + jy = 1 + (jj-1) + + do j = 1, jc-1 + ! Do local update for blocks on the diagonal + if ( ( x( jx + j -1) /= zero ) .or. & + ( y( jy + j -1 ) /= zero ) ) then + temp1 = - y( jy + j - 1 ) + temp2 = - x( jy + j - 1 ) + do i = j+1, jc + a( jj + i -1 , jj + j -1 ) = a(jj + i -1,jj + j -1 ) + x( jx + i -1)*temp1 - y(jj + i -1 )*temp2 + end do + end if + end do + + ! Use dger for other blocks + do ii = jj+nb, n, nb + ic = min( nb, n-ii+1 ) + ix = 1 + (ii-1) + iy = 1 + (ii-1) +#if REALCASE == 1 + call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -one, x( ix ), 1_BLAS_KIND, y( jy ), 1_BLAS_KIND, & + a( ii, jj ), int(lda,kind=BLAS_KIND) ) + call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), one, y( iy ), 1_BLAS_KIND, x( jx ), 1_BLAS_KIND, & + a( ii, jj ), int(lda,kind=BLAS_KIND) ) +#endif + end do + end do + + return +end subroutine diff -Nru elpa-2016.05.001/src/general/elpa_utilities.F90 elpa-2019.11.001/src/general/elpa_utilities.F90 --- elpa-2016.05.001/src/general/elpa_utilities.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/elpa_utilities.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,209 @@ +! This file is part of ELPA. +! +! 
The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +module ELPA_utilities + +#ifdef HAVE_ISO_FORTRAN_ENV + use iso_fortran_env, only : output_unit, error_unit +#endif + use, intrinsic :: iso_c_binding + implicit none + + private ! By default, all routines contained are private + + public :: output_unit, error_unit + public :: check_alloc, check_alloc_CUDA_f, check_memcpy_CUDA_f, check_dealloc_CUDA_f + public :: map_global_array_index_to_local_index + public :: pcol, prow + public :: local_index ! Get local index of a block cyclic distributed matrix + public :: least_common_multiple ! Get least common multiple + +#ifndef HAVE_ISO_FORTRAN_ENV + integer, parameter :: error_unit = 0 + integer, parameter :: output_unit = 6 +#endif + + !****** + contains + +#include "prow_pcol.F90" + +!------------------------------------------------------------------------------- +#include "map_global_to_local.F90" + + integer function local_index(idx, my_proc, num_procs, nblk, iflag) + +!------------------------------------------------------------------------------- +! local_index: returns the local index for a given global index +! If the global index has no local index on the +! processor my_proc behaviour is defined by iflag +! +! Parameters +! +! idx Global index +! +! my_proc Processor row/column for which to calculate the local index +! +! num_procs Total number of processors along row/column +! +! nblk Blocksize +! +! iflag Controls the behaviour if idx is not on local processor +! iflag< 0 : Return last local index before that row/col +! iflag==0 : Return 0 +! iflag> 0 : Return next local index after that row/col +!------------------------------------------------------------------------------- + implicit none + + integer(kind=c_int) :: idx, my_proc, num_procs, nblk, iflag + + integer(kind=c_int) :: iblk + + iblk = (idx-1)/nblk ! global block number, 0 based + + if (mod(iblk,num_procs) == my_proc) then + + ! 
block is local, always return local row/col number + + local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1 + + else + + ! non local block + + if (iflag == 0) then + + local_index = 0 + + else + + local_index = (iblk/num_procs)*nblk + + if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk + + if (iflag>0) local_index = local_index + 1 + endif + endif + + end function local_index + + integer function least_common_multiple(a, b) + + ! Returns the least common multiple of a and b + ! There may be more efficient ways to do this, we use the most simple approach + implicit none + integer(kind=c_int), intent(in) :: a, b + + do least_common_multiple = a, a*(b-1), a + if(mod(least_common_multiple,b)==0) exit + enddo + ! if the loop is left regularly, least_common_multiple = a*b + + end function least_common_multiple + + subroutine check_alloc(function_name, variable_name, istat, errorMessage) + + implicit none + + character(len=*), intent(in) :: function_name + character(len=*), intent(in) :: variable_name + integer(kind=c_int), intent(in) :: istat + character(len=*), intent(in) :: errorMessage + + if (istat .ne. 
0) then + print *, function_name, ": error when allocating ", variable_name, " ", errorMessage + stop 1 + endif + end subroutine + + subroutine check_alloc_CUDA_f(file_name, line, successCUDA) + + implicit none + + character(len=*), intent(in) :: file_name + integer(kind=c_int), intent(in) :: line + logical :: successCUDA + + if (.not.(successCUDA)) then + print *, file_name, ":", line, " error in cuda_malloc when allocating " + stop 1 + endif + end subroutine + + subroutine check_dealloc_CUDA_f(file_name, line, successCUDA) + + implicit none + + character(len=*), intent(in) :: file_name + integer(kind=c_int), intent(in) :: line + logical :: successCUDA + + if (.not.(successCUDA)) then + print *, file_name, ":", line, " error in cuda_free when deallocating " + stop 1 + endif + end subroutine + + subroutine check_memcpy_CUDA_f(file_name, line, successCUDA) + + implicit none + + character(len=*), intent(in) :: file_name + integer(kind=c_int), intent(in) :: line + logical :: successCUDA + + if (.not.(successCUDA)) then + print *, file_name, ":", line, " error in cuda_memcpy when copying " + stop 1 + endif + end subroutine + +end module ELPA_utilities diff -Nru elpa-2016.05.001/src/general/map_global_to_local.F90 elpa-2019.11.001/src/general/map_global_to_local.F90 --- elpa-2016.05.001/src/general/map_global_to_local.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/map_global_to_local.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,39 @@ + + function map_global_array_index_to_local_index(iGLobal, jGlobal, iLocal, jLocal , nblk, np_rows, np_cols, my_prow, my_pcol) & + result(possible) + use iso_c_binding, only : c_int + implicit none + + integer(kind=c_int) :: pi, pj, li, lj, xi, xj + integer(kind=c_int), intent(in) :: iGlobal, jGlobal, nblk, np_rows, np_cols, my_prow, my_pcol + integer(kind=c_int), intent(out) :: iLocal, jLocal + logical :: possible + + possible = .true. 
+ iLocal = 0 + jLocal = 0 + + pi = prow(iGlobal, nblk, np_rows) + + if (my_prow .ne. pi) then + possible = .false. + return + endif + + pj = pcol(jGlobal, nblk, np_cols) + + if (my_pcol .ne. pj) then + possible = .false. + return + endif + li = (iGlobal-1)/(np_rows*nblk) ! block number for rows + lj = (jGlobal-1)/(np_cols*nblk) ! block number for columns + + xi = mod( (iGlobal-1),nblk)+1 ! offset in block li + xj = mod( (jGlobal-1),nblk)+1 ! offset in block lj + + iLocal = li * nblk + xi + jLocal = lj * nblk + xj + + end function + diff -Nru elpa-2016.05.001/src/general/mod_elpa_skewsymmetric_blas.F90 elpa-2019.11.001/src/general/mod_elpa_skewsymmetric_blas.F90 --- elpa-2016.05.001/src/general/mod_elpa_skewsymmetric_blas.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/mod_elpa_skewsymmetric_blas.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,103 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! 
Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! 
Author: Andreas Marek, MPCDF +#include "config-f90.h" + +module elpa_skewsymmetric_blas + use precision + use iso_c_binding + contains + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "./precision_macros.h" +#include "./elpa_ssr2_template.F90" +#include "./elpa_ssmv_template.F90" +#undef REALCASE +#undef DOUBLE_PRECISION + +#if defined(WANT_SINGLE_PRECISION_REAL) + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "./precision_macros.h" +#include "./elpa_ssr2_template.F90" +#include "./elpa_ssmv_template.F90" +#undef REALCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "./precision_macros.h" +#include "./elpa_ssr2_template.F90" +#include "./elpa_ssmv_template.F90" +#undef COMPLEXCASE +#undef DOUBLE_PRECISION + +#if defined(WANT_SINGLE_PRECISION_COMPLEX) + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "./precision_macros.h" +#include "./elpa_ssr2_template.F90" +#include "./elpa_ssmv_template.F90" +#undef COMPLEXCASE +#undef SINGLE_PRECISION + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + + +end module diff -Nru elpa-2016.05.001/src/general/precision_kinds.F90 elpa-2019.11.001/src/general/precision_kinds.F90 --- elpa-2016.05.001/src/general/precision_kinds.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/precision_kinds.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,25 @@ +#ifdef REALCASE +#ifdef DOUBLE_PRECISION + integer, parameter :: rk = C_DOUBLE + integer, parameter :: rck = C_DOUBLE +#endif +#ifdef SINGLE_PRECISION + integer, parameter :: rk = C_FLOAT + integer, parameter :: rck = C_FLOAT +#endif + real(kind=rck), parameter :: ZERO=0.0_rk, ONE = 1.0_rk +#endif + +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION + integer, parameter :: rk = C_DOUBLE + integer, parameter :: ck = C_DOUBLE_COMPLEX + integer, parameter :: rck = C_DOUBLE_COMPLEX +#endif +#ifdef SINGLE_PRECISION + integer, parameter :: rk = C_FLOAT + 
integer, parameter :: ck = C_FLOAT_COMPLEX + integer, parameter :: rck = C_FLOAT_COMPLEX +#endif + complex(kind=rck), parameter :: ZERO = (0.0_rk,0.0_rk), ONE = (1.0_rk,0.0_rk) +#endif diff -Nru elpa-2016.05.001/src/general/precision_macros.h elpa-2019.11.001/src/general/precision_macros.h --- elpa-2016.05.001/src/general/precision_macros.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/precision_macros.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,429 @@ +#ifdef REALCASE +#undef DOUBLE_PRECISION_REAL +#undef SINGLE_PRECSION_REAL +#undef MATH_DATATYPE +#undef BLAS_TRANS_OR_CONJ +#undef BLAS_CHAR +#undef BLAS_CHAR_AND_SY_OR_HE +#undef PRECISION +#undef SPECIAL_COMPLEX_DATATYPE +#undef PRECISION_STR +#undef REAL_DATATYPE +#undef C_REAL_DATATYPE + +#undef C_GEMM +#undef C_LACPY +#undef C_PLACPY +#undef C_PTRAN + +#undef PRECISION_TRTRI +#undef PRECISION_POTRF +#undef PRECISION_TRSM +#undef PRECISION_GEMV +#undef PRECISION_TRMV +#undef PRECISION_GEMM +#undef PRECISION_TRMM +#undef PRECISION_HERK +#undef PRECISION_SYRK +#undef PRECISION_SYMV +#undef PRECISION_SYMM +#undef PRECISION_HEMV +#undef PRECISION_HER2 +#undef PRECISION_SYR2 +#undef PRECISION_SYR2K +#undef PRECISION_GEQRF +#undef PRECISION_STEDC +#undef PRECISION_STEQR +#undef PRECISION_LAMRG +#undef PRECISION_LAMCH +#undef PRECISION_LAPY2 +#undef PRECISION_LAED4 +#undef PRECISION_LAED5 +#undef PRECISION_NRM2 +#undef PRECISION_LASET +#undef PRECISION_SCAL +#undef PRECISION_COPY +#undef PRECISION_AXPY +#undef PRECISION_GER +#undef cublas_PRECISION_GEMM +#undef cublas_PRECISION_TRMM +#undef cublas_PRECISION_GEMV +#undef cublas_PRECISION_SYMV +#undef scal_PRECISION_GEMM +#undef scal_PRECISION_NRM2 +#undef scal_PRECISION_LASET +#undef PRECISION_SUFFIX +#undef ELPA_IMPL_SUFFIX + +#undef ELPA_PRECISION_SSMV +#undef ELPA_PRECISION_SSR2 + +#undef MPI_REAL_PRECISION +#undef MPI_MATH_DATATYPE_PRECISION +#undef MPI_MATH_DATATYPE_PRECISION_C +#undef MPI_MATH_DATATYPE_PRECISION_EXPL +#undef 
C_DATATYPE_KIND +#undef THRESHOLD + + +/* General definitions needed in single and double case */ +#define MATH_DATATYPE real +#define BLAS_TRANS_OR_CONJ 'T' + +#ifdef DOUBLE_PRECISION +#define DOUBLE_PRECISION_REAL +#define PRECISION double +#define PRECISION_STR 'double' +#define PRECISION_SUFFIX "_double" +#define ELPA_IMPL_SUFFIX d +#define REAL_DATATYPE rk8 +#define C_REAL_DATATYPE c_double +#define BLAS_CHAR D +#define BLAS_CHAR_AND_SY_OR_HE DSY +#define SPECIAL_COMPLEX_DATATYPE ck8 + +#define PRECISION_TRTRI DTRTRI +#define PRECISION_POTRF DPOTRF +#define PRECISION_TRSM DTRSM +#define PRECISION_GEMV DGEMV +#define PRECISION_TRMV DTRMV +#define PRECISION_GEMM DGEMM +#define PRECISION_TRMM DTRMM +#define PRECISION_HERK DHERK +#define PRECISION_SYRK DSYRK +#define PRECISION_SYMV DSYMV +#define PRECISION_SYMM DSYMM +#define PRECISION_HEMV DHEMV +#define PRECISION_HER2 DHER2 +#define PRECISION_SYR2 DSYR2 +#define PRECISION_SYR2K DSYR2K +#define PRECISION_GEQRF DGEQRF +#define PRECISION_STEDC DSTEDC +#define PRECISION_STEQR DSTEQR +#define PRECISION_LAMRG DLAMRG +#define PRECISION_LAMCH DLAMCH +#define PRECISION_LAPY2 DLAPY2 +#define PRECISION_LAED4 DLAED4 +#define PRECISION_LAED5 DLAED5 +#define PRECISION_NRM2 DNRM2 +#define PRECISION_LASET DLASET +#define PRECISION_GER DGER +#define PRECISION_SCAL DSCAL +#define PRECISION_COPY DCOPY +#define PRECISION_AXPY DAXPY +#define cublas_PRECISION_GEMM cublas_DGEMM +#define cublas_PRECISION_TRMM cublas_DTRMM +#define cublas_PRECISION_GEMV cublas_DGEMV +#define cublas_PRECISION_SYMV cublas_DSYMV +#define scal_PRECISION_GEMM PDGEMM +#define scal_PRECISION_NRM2 PDNRM2 +#define scal_PRECISION_LASET PDLASET +#define MPI_REAL_PRECISION MPI_REAL8 +#define MPI_MATH_DATATYPE_PRECISION MPI_REAL8 +#define MPI_MATH_DATATYPE_PRECISION_C MPI_DOUBLE +#define MPI_MATH_DATATYPE_PRECISION_EXPL MPI_REAL8 +#define C_DATATYPE_KIND c_double + +#define ELPA_PRECISION_SSMV elpa_dssmv +#define ELPA_PRECISION_SSR2 elpa_dssr2 + +#define C_GEMM 
dgemm_ +#define C_LACPY dlacpy_ +#define C_PLACPY pdlacpy_ +#define C_PTRAN pdtran_ + +#define THRESHOLD 1e-11_rk8 +#endif /* DOUBLE_PRECISION */ + +#ifdef SINGLE_PRECISION + +#define SINGLE_PRECISION_REAL + +#define PRECISION single +#define PRECISION_STR 'single' +#define PRECISION_SUFFIX "_single" +#define ELPA_IMPL_SUFFIX f +#define REAL_DATATYPE rk4 +#define C_REAL_DATATYPE c_float +#define BLAS_CHAR S +#define BLAS_CHAR_AND_SY_OR_HE SSY +#define SPECIAL_COMPLEX_DATATYPE ck4 + +#define PRECISION_TRTRI STRTRI +#define PRECISION_POTRF SPOTRF +#define PRECISION_TRSM STRSM +#define PRECISION_GEMV SGEMV +#define PRECISION_TRMV STRMV +#define PRECISION_GEMM SGEMM +#define PRECISION_TRMM STRMM +#define PRECISION_HERK SHERK +#define PRECISION_SYRK SSYRK +#define PRECISION_SYMV SSYMV +#define PRECISION_SYMM SSYMM +#define PRECISION_HEMV SHEMV +#define PRECISION_HER2 SHER2 +#define PRECISION_SYR2 SSYR2 +#define PRECISION_SYR2K SSYR2K +#define PRECISION_GEQRF SGEQRF +#define PRECISION_STEDC SSTEDC +#define PRECISION_STEQR SSTEQR +#define PRECISION_LAMRG SLAMRG +#define PRECISION_LAMCH SLAMCH +#define PRECISION_LAPY2 SLAPY2 +#define PRECISION_LAED4 SLAED4 +#define PRECISION_LAED5 SLAED5 +#define PRECISION_NRM2 SNRM2 +#define PRECISION_LASET SLASET +#define PRECISION_GER SGER +#define PRECISION_SCAL SSCAL +#define PRECISION_COPY SCOPY +#define PRECISION_AXPY SAXPY +#define cublas_PRECISION_GEMM cublas_SGEMM +#define cublas_PRECISION_TRMM cublas_STRMM +#define cublas_PRECISION_GEMV cublas_SGEMV +#define cublas_PRECISION_SYMV cublas_SSYMV +#define scal_PRECISION_GEMM PSGEMM +#define scal_PRECISION_NRM2 PSNRM2 +#define scal_PRECISION_LASET PSLASET +#define MPI_REAL_PRECISION MPI_REAL4 +#define MPI_MATH_DATATYPE_PRECISION MPI_REAL4 +#define MPI_MATH_DATATYPE_PRECISION_C MPI_FLOAT +#define MPI_MATH_DATATYPE_PRECISION_EXPL MPI_REAL4 +#define C_DATATYPE_KIND c_float + +#define ELPA_PRECISION_SSMV elpa_sssmv +#define ELPA_PRECISION_SSR2 elpa_sssr2 + +#define C_GEMM sgemm_ +#define 
C_LACPY slacpy_ +#define C_PLACPY pslacpy_ +#define C_PTRAN pstran_ + +#define THRESHOLD 1e-4_rk4 +#endif /* SINGLE_PRECISION */ + +#endif /* REALCASE */ + +#ifdef COMPLEXCASE + +#undef DOUBLE_PRECISION_COMPLEX +#undef SINGLE_PRECISION_COMPLEX +#undef MATH_DATATYPE +#undef BLAS_TRANS_OR_CONJ +#undef BLAS_CHAR +#undef BLAS_CHAR_AND_SY_OR_HE +#undef PRECISION +#undef COMPLEX_DATATYPE +/* in the complex case also sometime real valued variables are needed */ +#undef REAL_DATATYPE +#undef C_REAL_DATATYPE + +#undef C_GEMM +#undef C_LACPY +#undef C_PLACPY +#undef C_PTRAN + +#undef PRECISION_TRTRI +#undef PRECISION_POTRF +#undef PRECISION_TRSM +#undef PRECISION_STR +#undef PRECISION_GEMV +#undef PRECISION_TRMV +#undef PRECISION_GEMM +#undef PRECISION_TRMM +#undef PRECISION_HERK +#undef PRECISION_SYRK +#undef PRECISION_SYMV +#undef PRECISION_SYMM +#undef PRECISION_HEMV +#undef PRECISION_HER2 +#undef PRECISION_SYR2 +#undef PRECISION_SYR2K +#undef PRECISION_GEQRF +#undef PRECISION_STEDC +#undef PRECISION_STEQR +#undef PRECISION_LAMRG +#undef PRECISION_LAMCH +#undef PRECISION_LAPY2 +#undef PRECISION_LAED4 +#undef PRECISION_LAED5 +#undef PRECISION_DOTC +#undef PRECISION_LASET +#undef PRECISION_GER +#undef PRECISION_SCAL +#undef PRECISION_COPY +#undef PRECISION_AXPY +#undef cublas_PRECISION_GEMM +#undef cublas_PRECISION_TRMM +#undef cublas_PRECISION_GEMV +#undef cublas_PRECISION_SYMV +#undef scal_PRECISION_GEMM +#undef scal_PRECISION_DOTC +#undef scal_PRECISION_LASET +#undef PRECISION_SUFFIX +#undef ELPA_IMPL_SUFFIX +#undef MPI_COMPLEX_PRECISION +#undef MPI_MATH_DATATYPE_PRECISION +#undef MPI_MATH_DATATYPE_PRECISION_C +#undef MPI_MATH_DATATYPE_PRECISION_EXPL +#undef MPI_COMPLEX_EXPLICIT_PRECISION +#undef MPI_REAL_PRECISION +#undef KIND_PRECISION +#undef PRECISION_CMPLX +#undef PRECISION_IMAG +#undef PRECISION_REAL +#undef C_DATATYPE_KIND + +#undef ELPA_PRECISION_SSMV +#undef ELPA_PRECISION_SSR2 + +#undef THRESHOLD + +/* General definitions needed in single and double case */ 
+#define MATH_DATATYPE complex +#define BLAS_TRANS_OR_CONJ 'C' +#ifdef DOUBLE_PRECISION + +#define DOUBLE_PRECISION_COMPLEX +#define PRECISION double +#define PRECISION_STR 'double' +#define PRECISION_SUFFIX "_double" +#define ELPA_IMPL_SUFFIX dc +#define COMPLEX_DATATYPE CK8 +#define BLAS_CHAR Z +#define BLAS_CHAR_AND_SY_OR_HE ZHE +#define REAL_DATATYPE RK8 +#define C_REAL_DATATYPE c_double + +#define C_GEMM zgemm_ +#define C_LACPY zlacpy_ +#define C_PLACPY pzlacpy_ +#define C_PTRAN pztranc_ + +#define PRECISION_TRTRI ZTRTRI +#define PRECISION_POTRF ZPOTRF +#define PRECISION_TRSM ZTRSM +#define PRECISION_GEMV ZGEMV +#define PRECISION_TRMV ZTRMV +#define PRECISION_GEMM ZGEMM +#define PRECISION_TRMM ZTRMM +#define PRECISION_HERK ZHERK +#define PRECISION_SYRK ZSYRK +#define PRECISION_SYMV ZSYMV +#define PRECISION_SYMM ZSYMM +#define PRECISION_HEMV ZHEMV +#define PRECISION_HER2 ZHER2 +#define PRECISION_SYR2 ZSYR2 +#define PRECISION_SYR2K ZSYR2K +#define PRECISION_GEQRF ZGEQRF +#define PRECISION_STEDC ZSTEDC +#define PRECISION_STEQR ZSTEQR +#define PRECISION_LAMRG ZLAMRG +#define PRECISION_LAMCH ZLAMCH +#define PRECISION_LAPY2 ZLAPY2 +#define PRECISION_LAED4 ZLAED4 +#define PRECISION_LAED5 ZLAED5 +#define PRECISION_DOTC ZDOTC +#define PRECISION_LASET ZLASET +#define PRECISION_GER ZGER +#define PRECISION_SCAL ZSCAL +#define PRECISION_COPY ZCOPY +#define PRECISION_AXPY ZAXPY +#define cublas_PRECISION_GEMM cublas_ZGEMM +#define cublas_PRECISION_TRMM cublas_ZTRMM +#define cublas_PRECISION_GEMV cublas_ZGEMV +#define cublas_PRECISION_SYMV cublas_ZSYMV +#define scal_PRECISION_GEMM PZGEMM +#define scal_PRECISION_DOTC PZDOTC +#define scal_PRECISION_LASET PZLASET +#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX +#define MPI_MATH_DATATYPE_PRECISION MPI_DOUBLE_COMPLEX +#define MPI_MATH_DATATYPE_PRECISION_C MPI_DOUBLE_COMPLEX +#define MPI_MATH_DATATYPE_PRECISION_EXPL MPI_COMPLEX16 +#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX16 +#define MPI_REAL_PRECISION MPI_REAL8 +#define 
KIND_PRECISION rk8 +#define PRECISION_CMPLX DCMPLX +#define PRECISION_IMAG DIMAG +#define PRECISION_REAL DREAL +#define C_DATATYPE_KIND c_double + +#define ELPA_PRECISION_SSMV elpa_zssmv +#define ELPA_PRECISION_SSR2 elpa_zssr2 + + +#define THRESHOLD 1e-11_rk8 +#endif /* DOUBLE PRECISION */ + +#ifdef SINGLE_PRECISION +#define SINGLE_PRECISION_COMPLEX +#define PRECISION single +#define PRECISION_STR 'single' +#define PRECISION_SUFFIX "_single" +#define ELPA_IMPL_SUFFIX fc +#define COMPLEX_DATATYPE CK4 +#define BLAS_CHAR C +#define BLAS_CHAR_AND_SY_OR_HE CHE +#define REAL_DATATYPE RK4 +#define C_REAL_DATATYPE c_float + +#define C_GEMM cgemm_ +#define C_LACPY clacpy_ +#define C_PLACPY pclacpy_ +#define C_PTRAN pctranc_ + +#define PRECISION_TRTRI CTRTRI +#define PRECISION_POTRF CPOTRF +#define PRECISION_TRSM CTRSM +#define PRECISION_GEMV CGEMV +#define PRECISION_TRMV CTRMV +#define PRECISION_GEMM CGEMM +#define PRECISION_TRMM CTRMM +#define PRECISION_HERK CHERK +#define PRECISION_SYRK CSYRK +#define PRECISION_SYMV CSYMV +#define PRECISION_SYMM CSYMM +#define PRECISION_HEMV CHEMV +#define PRECISION_HER2 CHER2 +#define PRECISION_SYR2 CSYR2 +#define PRECISION_SYR2K CSYR2K +#define PRECISION_GEQRF CGEQRF +#define PRECISION_STEDC CSTEDC +#define PRECISION_STEQR CSTEQR +#define PRECISION_LAMRG CLAMRG +#define PRECISION_LAMCH CLAMCH +#define PRECISION_LAPY2 CLAPY2 +#define PRECISION_LAED4 CLAED4 +#define PRECISION_LAED5 CLAED5 +#define PRECISION_DOTC CDOTC +#define PRECISION_LASET CLASET +#define PRECISION_SCAL CSCAL +#define PRECISION_COPY CCOPY +#define PRECISION_AXPY CAXPY +#define PRECISION_GER CGER +#define cublas_PRECISION_GEMM cublas_CGEMM +#define cublas_PRECISION_TRMM cublas_CTRMM +#define cublas_PRECISION_GEMV cublas_CGEMV +#define cublas_PRECISION_SYMV cublas_CSYMV +#define scal_PRECISION_GEMM PCGEMM +#define scal_PRECISION_DOTC PCDOTC +#define scal_PRECISION_LASET PCLASET +#define MPI_COMPLEX_PRECISION MPI_COMPLEX +#define MPI_MATH_DATATYPE_PRECISION MPI_COMPLEX 
+#define MPI_MATH_DATATYPE_PRECISION_C MPI_COMPLEX +#define MPI_MATH_DATATYPE_PRECISION_EXPL MPI_COMPLEX8 +#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX8 +#define MPI_REAL_PRECISION MPI_REAL4 +#define KIND_PRECISION rk4 +#define PRECISION_CMPLX CMPLX +#define PRECISION_IMAG AIMAG +#define PRECISION_REAL REAL +#define C_DATATYPE_KIND c_float + +#define ELPA_PRECISION_SSMV elpa_cssmv +#define ELPA_PRECISION_SSR2 elpa_cssr2 + +#define THRESHOLD 1e-4_rk4 +#endif /* SINGLE PRECISION */ + +#endif /* COMPLEXCASE */ diff -Nru elpa-2016.05.001/src/general/precision_typedefs.h elpa-2019.11.001/src/general/precision_typedefs.h --- elpa-2016.05.001/src/general/precision_typedefs.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/precision_typedefs.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,23 @@ +#ifdef REALCASE +#ifdef DOUBLE_PRECISION +//typedef double math_type; +#undef math_type +#define math_type double +#endif +#ifdef SINGLE_PRECISION +//typedef float math_type; +#undef math_type +#define math_type float +#endif +#endif + +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION +#undef math_type +#define math_type double complex +#endif +#ifdef SINGLE_PRECISION +#undef math_type +#define math_type float complex +#endif +#endif diff -Nru elpa-2016.05.001/src/general/prow_pcol.F90 elpa-2019.11.001/src/general/prow_pcol.F90 --- elpa-2016.05.001/src/general/prow_pcol.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/prow_pcol.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,18 @@ + !Processor col for global col number + pure function pcol(global_col, nblk, np_cols) result(local_col) + use iso_c_binding, only : c_int + implicit none + integer(kind=c_int), intent(in) :: global_col, nblk, np_cols + integer(kind=c_int) :: local_col + local_col = MOD((global_col-1)/nblk,np_cols) + end function + + !Processor row for global row number + pure function prow(global_row, nblk, np_rows) result(local_row) + use iso_c_binding, only : c_int + 
implicit none + integer(kind=c_int), intent(in) :: global_row, nblk, np_rows + integer(kind=c_int) :: local_row + local_row = MOD((global_row-1)/nblk,np_rows) + end function + diff -Nru elpa-2016.05.001/src/general/sanity.F90 elpa-2019.11.001/src/general/sanity.F90 --- elpa-2016.05.001/src/general/sanity.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/general/sanity.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,23 @@ +#ifdef REALCASE +#ifdef COMPLEXCASE +#error Cannot define both REALCASE and COMPLEXCASE +#endif +#endif + +#ifndef REALCASE +#ifndef COMPLEXCASE +#error Must define one of REALCASE or COMPLEXCASE +#endif +#endif + +#ifdef SINGLE_PRECISION +#ifdef DOUBLE_PRECISION +#error Cannot define both SINGLE_PRECISION and DOUBLE_PRECISION +#endif +#endif + +#ifndef SINGLE_PRECISION +#ifndef DOUBLE_PRECISION +#error Must define one of SINGLE_PRECISION or DOUBLE_PRECISION +#endif +#endif diff -Nru elpa-2016.05.001/src/GPU/check_for_gpu.F90 elpa-2019.11.001/src/GPU/check_for_gpu.F90 --- elpa-2016.05.001/src/GPU/check_for_gpu.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/GPU/check_for_gpu.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,139 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! 
it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + +#include "config-f90.h" + +module mod_check_for_gpu + + contains + + function check_for_gpu(myid, numberOfDevices, wantDebug) result(gpuAvailable) + use cuda_functions + use precision + use elpa_mpi + implicit none + + integer(kind=ik), intent(in) :: myid + logical, optional, intent(in) :: wantDebug + logical :: success, wantDebugMessage + integer(kind=ik), intent(out) :: numberOfDevices + integer(kind=ik) :: deviceNumber, mpierr, maxNumberOfDevices + logical :: gpuAvailable + !character(len=1024) :: envname + + gpuAvailable = .false. + + if (cublasHandle .ne. -1) then + gpuAvailable = .true. + numberOfDevices = -1 + if (myid == 0) then + print *, "Skipping GPU init, should have already been initialized " + endif + return + else + if (myid == 0) then + print *, "Initializing the GPU devices" + endif + endif + + if (.not.(present(wantDebug))) then + wantDebugMessage = .false. + else + if (wantDebug) then + wantDebugMessage=.true. + else + wantDebugMessage=.false. + endif + endif + + ! 
call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname) + success = cuda_getdevicecount(numberOfDevices) + + if (.not.(success)) then + print *,"error in cuda_getdevicecount" + stop 1 + endif + + ! make sure that all nodes have the same number of GPU's, otherwise + ! we run into loadbalancing trouble +#ifdef WITH_MPI + call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr) + + if (maxNumberOfDevices .ne. numberOfDevices) then + print *,"Different number of GPU devices on MPI tasks!" + print *,"GPUs will NOT be used!" + gpuAvailable = .false. + return + endif +#endif + if (numberOfDevices .ne. 0) then + gpuAvailable = .true. + ! Usage of GPU is possible since devices have been detected + + if (myid==0) then + if (wantDebugMessage) then + print * + print '(3(a,i0))','Found ', numberOfDevices, ' GPUs' + endif + endif + + deviceNumber = mod(myid, numberOfDevices) + success = cuda_setdevice(deviceNumber) + + if (.not.(success)) then + print *,"Cannot set CudaDevice" + stop 1 + endif + if (wantDebugMessage) then + print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber + endif + + success = cublas_create(cublasHandle) + if (.not.(success)) then + print *,"Cannot create cublas handle" + stop 1 + endif + + endif + + end function +end module diff -Nru elpa-2016.05.001/src/GPU/cudaFunctions.cu elpa-2019.11.001/src/GPU/cudaFunctions.cu --- elpa-2016.05.001/src/GPU/cudaFunctions.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/GPU/cudaFunctions.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,443 @@ +#include +#include +#include +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was written by A. Marek, MPCDF + + +#include +#include +#include +#include +#include +#include +#include + +#include "config-f90.h" + +#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0) + +#ifdef DEBUG_CUDA +#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0) +#else +#define debugmessage(x, ...) +#endif + +#ifdef WITH_GPU_VERSION +extern "C" { + + int cublasCreateFromC(intptr_t *cublas_handle) { +// printf("in c: %p\n", *cublas_handle); + *cublas_handle = (intptr_t) malloc(sizeof(cublasHandle_t)); +// printf("in c: %p\n", *cublas_handle); + cublasStatus_t status = cublasCreate((cublasHandle_t*) *cublas_handle); + if (status == CUBLAS_STATUS_SUCCESS) { +// printf("all OK\n"); + return 1; + } + else if (status == CUBLAS_STATUS_NOT_INITIALIZED) { + errormessage("Error in cublasCreate: %s\n", "the CUDA Runtime initialization failed"); + return 0; + } + else if (status == CUBLAS_STATUS_ALLOC_FAILED) { + errormessage("Error in cublasCreate: %s\n", "the resources could not be allocated"); + return 0; + } + else{ + errormessage("Error in cublasCreate: %s\n", "unknown error"); + return 0; + } + } + + int cublasDestroyFromC(intptr_t *cublas_handle) { + cublasStatus_t status = cublasDestroy(*((cublasHandle_t*) *cublas_handle)); + *cublas_handle = (intptr_t) NULL; + if (status == CUBLAS_STATUS_SUCCESS) { +// printf("all OK\n"); + return 1; + } + else if (status == CUBLAS_STATUS_NOT_INITIALIZED) { + errormessage("Error in cublasDestroy: %s\n", "the library has not been initialized"); + return 0; + } + else{ + errormessage("Error in cublasCreate: %s\n", "unknown error"); + return 0; + } + } + + int cudaThreadSynchronizeFromC() { + // cudaThreadSynchronize is deprecated + // cudaDeviceSynchronize should replace it + // it is currently not 
used in ELPA anyways + //cudaError_t cuerr = cudaThreadSynchronize(); + cudaError_t cuerr = cudaDeviceSynchronize(); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaDeviceSynchronize: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + + int cudaSetDeviceFromC(int n) { + + cudaError_t cuerr = cudaSetDevice(n); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaSetDevice: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + int cudaGetDeviceCountFromC(int *count) { + + cudaError_t cuerr = cudaGetDeviceCount(count); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + int cudaDeviceSynchronizeFromC() { + + cudaError_t cuerr = cudaDeviceSynchronize(); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + + int cudaMallocFromC(intptr_t *a, size_t width_height) { + + cudaError_t cuerr = cudaMalloc((void **) a, width_height); +#ifdef DEBUG_CUDA + printf("CUDA Malloc, pointer address: %p, size: %d \n", *a, width_height); +#endif + if (cuerr != cudaSuccess) { + errormessage("Error in cudaMalloc: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + int cudaFreeFromC(intptr_t *a) { +#ifdef DEBUG_CUDA + printf("CUDA Free, pointer address: %p \n", a); +#endif + cudaError_t cuerr = cudaFree(a); + + if (cuerr != cudaSuccess) { + errormessage("Error in cudaFree: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + int cudaMemsetFromC(intptr_t *a, int value, size_t count) { + + cudaError_t cuerr = cudaMemset( a, value, count); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaMemset: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + int cudaMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) { + + cudaError_t cuerr = cudaMemcpy( dest, src, count, (cudaMemcpyKind)dir); + if 
(cuerr != cudaSuccess) { + errormessage("Error in cudaMemcpy: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + + int cudaMemcpy2dFromC(intptr_t *dest, size_t dpitch, intptr_t *src, size_t spitch, size_t width, size_t height, int dir) { + + cudaError_t cuerr = cudaMemcpy2D( dest, dpitch, src, spitch, width, height, (cudaMemcpyKind)dir); + if (cuerr != cudaSuccess) { + errormessage("Error in cudaMemcpy2d: %s\n",cudaGetErrorString(cuerr)); + return 0; + } + return 1; + } + int cudaMemcpyDeviceToDeviceFromC(void) { + int val = cudaMemcpyDeviceToDevice; + return val; + } + int cudaMemcpyHostToDeviceFromC(void) { + int val = cudaMemcpyHostToDevice; + return val; + } + int cudaMemcpyDeviceToHostFromC(void) { + int val = cudaMemcpyDeviceToHost; + return val; + } + int cudaHostRegisterPortableFromC(void) { + int val = cudaHostRegisterPortable; + return val; + } + int cudaHostRegisterMappedFromC(void) { + int val = cudaHostRegisterMapped; + return val; + } + + cublasOperation_t operation_new_api(char trans) { + if (trans == 'N' || trans == 'n') { + return CUBLAS_OP_N; + } + else if (trans == 'T' || trans == 't') { + return CUBLAS_OP_T; + } + else if (trans == 'C' || trans == 'c') { + return CUBLAS_OP_C; + } + else { + errormessage("Error when transfering %c to cublasOperation_t\n",trans); + // or abort? + return CUBLAS_OP_N; + } + } + + + cublasFillMode_t fill_mode_new_api(char uplo) { + if (uplo == 'L' || uplo == 'l') { + return CUBLAS_FILL_MODE_LOWER; + } + else if(uplo == 'U' || uplo == 'u') { + return CUBLAS_FILL_MODE_UPPER; + } + else { + errormessage("Error when transfering %c to cublasFillMode_t\n", uplo); + // or abort? + return CUBLAS_FILL_MODE_LOWER; + } + } + + cublasSideMode_t side_mode_new_api(char side) { + if (side == 'L' || side == 'l') { + return CUBLAS_SIDE_LEFT; + } + else if (side == 'R' || side == 'r') { + return CUBLAS_SIDE_RIGHT; + } + else{ + errormessage("Error when transfering %c to cublasSideMode_t\n", side); + // or abort? 
+ return CUBLAS_SIDE_LEFT; + } + } + + cublasDiagType_t diag_type_new_api(char diag) { + if (diag == 'N' || diag == 'n') { + return CUBLAS_DIAG_NON_UNIT; + } + else if (diag == 'U' || diag == 'u') { + return CUBLAS_DIAG_UNIT; + } + else { + errormessage("Error when transfering %c to cublasDiagMode_t\n", diag); + // or abort? + return CUBLAS_DIAG_NON_UNIT; + } + } + + + + void cublasDgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double alpha, + const double *A, int lda, const double *x, int incx, + double beta, double *y, int incy) { + + cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans), + m, n, &alpha, A, lda, x, incx, &beta, y, incy); + } + + void cublasSgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + + cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans), + m, n, &alpha, A, lda, x, incx, &beta, y, incy); + } + + void cublasZgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double _Complex alpha, + const double _Complex *A, int lda, const double _Complex *x, int incx, + double _Complex beta, double _Complex *y, int incy) { + + cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha)); + cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta)); + + const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A; + const cuDoubleComplex* x_casted = (const cuDoubleComplex*) x; + cuDoubleComplex* y_casted = (cuDoubleComplex*) y; + + cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans), + m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy); + } + + void cublasCgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float _Complex alpha, + const float _Complex *A, int lda, const float _Complex *x, int incx, + float _Complex beta, float _Complex *y, int incy) { + + cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha)); + cuFloatComplex beta_casted 
= *((cuFloatComplex*)(&beta)); + + const cuFloatComplex* A_casted = (const cuFloatComplex*) A; + const cuFloatComplex* x_casted = (const cuFloatComplex*) x; + cuFloatComplex* y_casted = (cuFloatComplex*) y; + + cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans), + m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy); + } + + + void cublasDgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, + double *C, int ldc) { + + cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), + m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + } + + void cublasSgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, + const float *B, int ldb, float beta, + float *C, int ldc) { + + cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), + m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc); + } + + void cublasZgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k, + double _Complex alpha, const double _Complex *A, int lda, + const double _Complex *B, int ldb, double _Complex beta, + double _Complex *C, int ldc) { + + cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha)); + cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta)); + + const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A; + const cuDoubleComplex* B_casted = (const cuDoubleComplex*) B; + cuDoubleComplex* C_casted = (cuDoubleComplex*) C; + + cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), + m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc); + } + + void cublasCgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k, + float _Complex alpha, const float _Complex *A, int lda, + const 
float _Complex *B, int ldb, float _Complex beta, + float _Complex *C, int ldc) { + + cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha)); + cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta)); + + const cuFloatComplex* A_casted = (const cuFloatComplex*) A; + const cuFloatComplex* B_casted = (const cuFloatComplex*) B; + cuFloatComplex* C_casted = (cuFloatComplex*) C; + + cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), + m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc); + } + + + // todo: new CUBLAS API diverged from standard BLAS api for these functions + // todo: it provides out-of-place (and apparently more efficient) implementation + // todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm + + void cublasDtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb){ + + cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), + diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb); + } + + void cublasStrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb){ + + cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), + diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb); + } + + void cublasZtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag, + int m, int n, double _Complex alpha, const double _Complex *A, + int lda, double _Complex *B, int ldb){ + + cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha)); + + const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A; + cuDoubleComplex* B_casted = (cuDoubleComplex*) B; + + cublasZtrmm(*((cublasHandle_t*)handle), 
side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), + diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb); + } + + void cublasCtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag, + int m, int n, float _Complex alpha, const float _Complex *A, + int lda, float _Complex *B, int ldb){ + + cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha)); + + const cuFloatComplex* A_casted = (const cuFloatComplex*) A; + cuFloatComplex* B_casted = (cuFloatComplex*) B; + + cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), + diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb); + } + + +} +#endif /* WITH_GPU_VERSION */ diff -Nru elpa-2016.05.001/src/GPU/cuUtils.cu elpa-2019.11.001/src/GPU/cuUtils.cu --- elpa-2016.05.001/src/GPU/cuUtils.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/GPU/cuUtils.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,80 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. 
Marek, MPCDF + +#include "config-f90.h" + +// The real part +#define REALCASE 1 +#undef COMPLEXCASE +#define DOUBLE_PRECISION_REAL 1 +#include "cuUtils_template.cu" +#undef DOUBLE_PRECISION_REAL + +#if WANT_SINGLE_PRECISION_REAL + +#undef DOUBLE_PRECISION_REAL +#include "cuUtils_template.cu" + +#endif + +// The complex part +#define COMPLEXCASE 1 +#undef REALCASE +#define DOUBLE_PRECISION_COMPLEX 1 +#include "cuUtils_template.cu" +#undef DOUBLE_PRECISION_COMPLEX + +#if WANT_SINGLE_PRECISION_COMPLEX + +#undef DOUBLE_PRECISION_COMPLEX +#include "cuUtils_template.cu" + +#endif + diff -Nru elpa-2016.05.001/src/GPU/cuUtils_template.cu elpa-2019.11.001/src/GPU/cuUtils_template.cu --- elpa-2016.05.001/src/GPU/cuUtils_template.cu 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/GPU/cuUtils_template.cu 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,942 @@ +#if 0 +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. 
Marek, MPCDF +#endif + + +#include "config-f90.h" + +#include +#include +#include + +#if COMPLEXCASE == 1 +#include +#endif + +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 +#define WARP_SIZE 32 + +// Reset a reduction block +// Limitation: the thread-block size must be a divider of the reduction block's size + +#if REALCASE == 1 + +#ifdef DOUBLE_PRECISION_REAL +__device__ void reset_shared_block_c_double ( double * s_block, int b_size) +#else +__device__ void reset_shared_block_c_single ( float * s_block, int b_size) +#endif + +#endif + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void reset_shared_block_c_complex_double ( cuDoubleComplex * s_block, int b_size) +#else +__device__ void reset_shared_block_c_complex_single ( cuFloatComplex * s_block, int b_size) +#endif +#endif + +{ + int i, t_idx, s_chunk ; + t_idx = threadIdx.x; + s_chunk = b_size / blockDim.x; + for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++) { +#if REALCASE == 1 + s_block[i] = 0.0 ; +#endif +#if COMPLEXCASE == 1 + s_block[i].x = 0.0 ; + s_block[i].y = 0.0 ; +#endif + } + __syncthreads(); +} + +// Reset 2 reduction blocks without an explicit synchronization at the end +// Limitation: : the thread-block size must be a divider of the reduction block's size +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__device__ void reset_shared_block_pair_c_real_double( double *s_block_1, double *s_block_2, int b_size) +#else +__device__ void reset_shared_block_pair_c_real_single( float *s_block_1, float *s_block_2, int b_size) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void reset_shared_block_pair_c_complex_double( cuDoubleComplex *s_block_1, cuDoubleComplex *s_block_2, int b_size) +#else +__device__ void reset_shared_block_pair_c_complex_single( cuFloatComplex *s_block_1, cuFloatComplex *s_block_2, int b_size) +#endif +#endif +{ + int i, t_idx, s_chunk; + + t_idx = threadIdx.x; + s_chunk = b_size / 
blockDim.x; + for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++) + { +#if REALCASE == 1 + s_block_1[i] = 0.0 ; + s_block_2[i] = 0.0 ; +#endif +#if COMPLEXCASE == 1 + s_block_1[i].x = 0.0 ; + s_block_2[i].x= 0.0 ; + s_block_1[i].y = 0.0 ; + s_block_2[i].y= 0.0 ; +#endif + } +} +// Reset a reduction block +// Limitation: the thread-block size must be a divider of the reduction block's size + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__device__ void warp_reduce_c_real_double( double *s_block) +#else +__device__ void warp_reduce_c_real_single( float *s_block) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void warp_reduce_complex_double( cuDoubleComplex *s_block) +#else +__device__ void warp_reduce_complex_single( cuFloatComplex *s_block) +#endif +#endif +{ + int t_idx ; + t_idx = threadIdx.x; + __syncthreads(); + +#if REALCASE == 1 + // attention + if (t_idx < 32) + { + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ; + if (t_idx < 8) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24]; + if (t_idx < 4) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4]; + if (t_idx < 1) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3]; + } +#endif +#if COMPLEXCASE == 1 + // attention + if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_COMPLEX + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#else + s_block[t_idx] = 
cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf( s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#endif +#endif /* COMPLEXCASE == 1 */ +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__global__ void my_pack_c_kernel_real_double(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst) +#else +__global__ void my_pack_c_kernel_real_single(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void my_pack_c_kernel_complex_double(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) +#else +__global__ void my_pack_c_kernel_complex_single(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) +#endif +#endif +{ + int b_id, t_id ; + int dst_ind ; + b_id = blockIdx.y; + t_id = threadIdx.x; + + dst_ind = b_id * stripe_width + t_id; + if (dst_ind < max_idx) + { + // dimension of dst - lnev, nblk + // dimension of src - stripe_width,a_dim2,stripe_count +#if REALCASE == 1 + *(dst + dst_ind + (l_nev*blockIdx.x) ) = *(src + t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2 )); +#endif +#if COMPLEXCASE == 1 + dst[dst_ind + (l_nev*blockIdx.x)].x = src[t_id + 
(stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].x; + dst[dst_ind + (l_nev*blockIdx.x)].y = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].y; +#endif + } + +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__global__ void my_unpack_c_kernel_real_double( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst) +#else +__global__ void my_unpack_c_kernel_real_single( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void my_unpack_c_kernel_complex_double( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) +#else +__global__ void my_unpack_c_kernel_complex_single( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) +#endif +#endif +{ + int b_id, t_id ; + int src_ind; + + b_id = blockIdx.y; + t_id = threadIdx.x; + + src_ind = b_id * stripe_width + t_id; + if (src_ind < max_idx) +#if REALCASE == 1 + *(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 ))) = *(src + src_ind + (blockIdx.x) *l_nev ); +#endif +#if COMPLEXCASE == 1 + dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].x = src[ src_ind + (blockIdx.x) *l_nev ].x; + dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].y = src[ src_ind + (blockIdx.x) *l_nev ].y; +#endif + +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__global__ void extract_hh_tau_c_kernel_real_double(double* hh, double* hh_tau, const int nbw, const int n, int val) +#else +__global__ 
void extract_hh_tau_c_kernel_real_single(float* hh, float* hh_tau, const int nbw, const int n, int val) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void extract_hh_tau_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* hh_tau, const int nbw, const int n, int val) +#else +__global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* hh_tau, const int nbw, const int n, int val) +#endif +#endif +{ + int h_idx ; + h_idx = (blockIdx.x) * blockDim.x + threadIdx.x; + + if (h_idx < n) + { + //dimension of hh - (nbw, max_blk_size) + //dimension of hh_tau - max_blk_size +#if REALCASE == 1 + *(hh_tau + h_idx ) = *(hh + (h_idx * nbw)) ; +#endif +#if COMPLEXCASE == 1 + hh_tau[h_idx] = hh[h_idx * nbw] ; +#endif + // Replace the first element in the HH reflector with 1.0 or 0.0 +#if REALCASE == 1 + if( val == 0) + *(hh + (h_idx * nbw)) = 1.0; + else + *(hh + (h_idx * nbw)) = 0.0; +#endif +#if COMPLEXCASE == 1 + if( val == 0) + { + hh[(h_idx * nbw)].x = 1.0; + hh[h_idx *nbw].y= 0.0; + } + else + { + hh[(h_idx * nbw)].x = 0.0; + hh[h_idx*nbw].y =0.0; + } +#endif + } +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +__global__ void compute_hh_dotp_c_kernel_real_double(double* hh, double* v_dot, const int nbw, const int n) +{ + + __shared__ double hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; +#else +__global__ void compute_hh_dotp_c_kernel_real_single(float* hh, float* v_dot, const int nbw, const int n) +{ + + __shared__ float hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n) +{ + __shared__ cuDoubleComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; + +#else +__global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n) +{ + __shared__ cuFloatComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; 
+#endif +#endif + int t_idx, v_idx; + + // The Vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed + v_idx = blockIdx.x ; + + // The thread index indicates the position within the two HH reflectors + t_idx = threadIdx.x ; + +// // The contents of the shared memory must be fully reset +// reset_shared_block_c(hh_s, BLOCK_CYCLIC_BLOCKSIZE); + + // Initialize the contents of the shared buffer (preparing for reduction) + if (t_idx > 0) + { +#if REALCASE == 1 + *(hh_s + t_idx) = *(hh + t_idx + v_idx * nbw ) * (*(hh + (t_idx - 1) + (v_idx +1)* nbw)) ; +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + hh_s[t_idx] = cuCmul(cuConj(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ; +#else + hh_s[t_idx] = cuCmulf(cuConjf(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ; +#endif +#endif + } + else + { +#if REALCASE == 1 + *(hh_s + t_idx) = 0.0 ; +#endif +#if COMPLEXCASE == 1 + hh_s[t_idx].x = 0.0 ; + hh_s[t_idx].y = 0.0; +#endif + } + // Compute the dot product using a fast reduction + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + warp_reduce_c_real_double(hh_s); +#else + warp_reduce_c_real_single(hh_s); +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + warp_reduce_complex_double(hh_s); +#else + warp_reduce_complex_single(hh_s); +#endif + __syncthreads(); +#endif + + if(t_idx == 0) + { +#if REALCASE == 1 + *(v_dot + v_idx) = *(hh_s) ; +#endif +#if COMPLEXCASE == 1 + v_dot[v_idx] = hh_s[0] ; +#endif + } + +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +extern "C" void launch_my_pack_c_kernel_real_double(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* a_dev, double* row_group_dev) +#else +extern "C" void launch_my_pack_c_kernel_real_single(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int 
stripe_count, const int l_nev, float* a_dev, float* row_group_dev) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_my_pack_c_kernel_complex_double(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* a_dev, cuDoubleComplex* row_group_dev) +#else +extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* a_dev, cuFloatComplex* row_group_dev) +#endif +#endif +{ + + dim3 grid_size; + grid_size = dim3(row_count, stripe_count, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err); +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + my_pack_c_kernel_real_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#else + my_pack_c_kernel_real_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + my_pack_c_kernel_complex_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#else + my_pack_c_kernel_complex_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#endif +#endif + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) ); + } + +} +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +extern "C" void launch_compute_hh_dotp_c_kernel_real_double(double* bcast_buffer_dev, double* hh_dot_dev,const int nbw,const int n) +#else +extern "C" void launch_compute_hh_dotp_c_kernel_real_single(float* bcast_buffer_dev, float* hh_dot_dev,const int nbw,const int 
n) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_dot_dev,const int nbw,const int n) +#else +extern "C" void launch_compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_dot_dev,const int nbw,const int n) +#endif +#endif +{ + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err); + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + compute_hh_dotp_c_kernel_real_double<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#else + compute_hh_dotp_c_kernel_real_single<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_dotp_c_kernel_complex_double<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#else + compute_hh_dotp_c_kernel_complex_single<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#endif +#endif + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) ); + } + +} +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +extern "C" void launch_extract_hh_tau_c_kernel_real_double(double* bcast_buffer_dev, double* hh_tau_dev, const int nbw, const int n , const int is_zero) +#else +extern "C" void launch_extract_hh_tau_c_kernel_real_single(float* bcast_buffer_dev, float* hh_tau_dev, const int nbw, const int n , const int is_zero) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_extract_hh_tau_c_kernel_complex_double(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_tau_dev, const int nbw, const int n , const int is_zero) +#else +extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_tau_dev, const 
int nbw, const int n , const int is_zero) +#endif +#endif +{ + int grid_size; + grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH; + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + extract_hh_tau_c_kernel_real_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#else + extract_hh_tau_c_kernel_real_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + extract_hh_tau_c_kernel_complex_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#else + extract_hh_tau_c_kernel_complex_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#endif +#endif + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) ); + } + +} + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +extern "C" void launch_my_unpack_c_kernel_real_double( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, double* row_group_dev, double* a_dev) +#else +extern "C" void launch_my_unpack_c_kernel_real_single( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, float* row_group_dev, float* a_dev) +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_my_unpack_c_kernel_complex_double( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* row_group_dev, cuDoubleComplex* a_dev) +#else +extern "C" void launch_my_unpack_c_kernel_complex_single( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, 
const int l_nev, cuFloatComplex* row_group_dev, cuFloatComplex* a_dev) +#endif +#endif +{ + dim3 grid_size; + grid_size = dim3(row_count, stripe_count, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err); +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + my_unpack_c_kernel_real_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#else + my_unpack_c_kernel_real_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#endif +#endif +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + my_unpack_c_kernel_complex_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#else + my_unpack_c_kernel_complex_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#endif +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) ); + } +} + +#if COMPLEXCASE == 1 + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_kernel_reduce_complex_double( cuDoubleComplex* a_dev, int lda , int n ,int nbw , cuDoubleComplex *h1_dev ) +#else +__global__ void compute_kernel_reduce_complex_single( cuFloatComplex* a_dev, int lda , int n ,int nbw , cuFloatComplex *h1_dev ) +#endif +{ + int t_id ; + int st_ind; + + t_id = threadIdx.x; + + st_ind = (t_id*(t_id+1))/2; + if(t_id< n) + { + for(int i =0;i<=t_id;i++) + { + h1_dev[st_ind + i] = a_dev[t_id *lda + i ] ; + } + } + __syncthreads(); +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_kernel_reduce_1_complex_ouble( cuDoubleComplex* a_dev, int lda , int n, cuDoubleComplex *h1_dev ) +#else +__global__ void compute_kernel_reduce_1_complex_ingle( cuFloatComplex* a_dev, int lda , int n, cuFloatComplex *h1_dev ) +#endif +{ + int t_id ; + int st_ind; + + t_id = 
threadIdx.x; + + st_ind = (t_id*(t_id+1))/2; + if(t_id< n) + { + for(int i =0;i<=t_id;i++) + { + a_dev[t_id *lda + i ] = h1_dev[st_ind + i]; +#ifdef DOUBLE_PRECISION_COMPLEX + a_dev[ (i-1)*lda + t_id ] = cuConj(a_dev[ t_id *lda + i-1]) ; +#else + a_dev[ (i-1)*lda + t_id ] = cuConjf(a_dev[ t_id *lda + i-1]) ; +#endif + } + } + __syncthreads(); +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void dot_product_c_kernel_complex_double( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr) +#else +__global__ void dot_product_c_kernel_complex_single( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex *h_dev, cuFloatComplex *hv_dev, int nr) +#endif +{ + int t_id ; + +#ifdef DOUBLE_PRECISION_COMPLEX + __shared__ cuDoubleComplex x_dev_temp[128]; + __shared__ cuDoubleComplex x_val; +#else + __shared__ cuFloatComplex x_dev_temp[128]; + __shared__ cuFloatComplex x_val; +#endif + //b_id = blockIdx.y; + t_id = threadIdx.x; + + if(t_id0 )&& (t_id < nb)) + { +#ifdef DOUBLE_PRECISION_COMPLEX + h_dev[t_id] = cuCsub(h_dev[t_id], cuCmul(x_dev[0],hv_dev[t_id])); +#else + h_dev[t_id] = cuCsubf(h_dev[t_id], cuCmulf(x_dev[0],hv_dev[t_id])); +#endif + for(i=0;i0 )&& (t_id < nb)) + { +#ifdef DOUBLE_PRECISION_COMPLEX + ab_dev[ nb-t_id + (t_id+ns-1)*2*nb ] = cuCsub(ab_dev[ nb-t_id + (t_id+ns-1)*2*nb],cuCmul(hs_dev[0], cuConj(hv_dev[t_id]))); +#else + ab_dev[ nb-t_id + (t_id+ns-1)*2*nb ] = cuCsubf(ab_dev[ nb-t_id + (t_id+ns-1)*2*nb],cuCmulf(hs_dev[0], cuConjf(hv_dev[t_id]))); +#endif + } + __syncthreads(); +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void double_hh_transform_kernel_2_complex_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev, cuDoubleComplex *hv_dev, int nc, int ns , int nb ) +#else +__global__ void double_hh_transform_kernel_2_complex_single( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev, 
cuFloatComplex *hv_dev, int nc, int ns , int nb ) +#endif +{ + int t_id = threadIdx.x; + if(t_id < nc) + { +#ifdef DOUBLE_PRECISION_COMPLEX + ab_dev[ t_id + (ns-1)*2*nb ] = cuCsub(cuCsub(ab_dev[ t_id + (ns-1)*2*nb],cuCmul(hd_dev[ t_id], cuConj(hv_dev[0]))) , cuCmul(hv_dev[ t_id], cuConj(hd_dev[0]))); +#else + ab_dev[ t_id + (ns-1)*2*nb ] = cuCsubf(cuCsubf(ab_dev[ t_id + (ns-1)*2*nb],cuCmulf(hd_dev[ t_id], cuConjf(hv_dev[0]))) , cuCmulf(hv_dev[ t_id], cuConjf(hd_dev[0]))); + +#endif + } + __syncthreads(); +} + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_dot_product_kernel_complex_double( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex* h_dev ,cuDoubleComplex* hv_dev,int nr ) +#else +extern "C" void launch_dot_product_kernel_complex_single( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex* h_dev ,cuFloatComplex* hv_dev,int nr ) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_dot_product_kernel_complex: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + dot_product_c_kernel_complex_double<<>>(hs_dev, hv_new_dev, tau_new_dev, x_dev, h_dev, hv_dev, nr ); +#else + dot_product_c_kernel_complex_single<<>>(hs_dev, hv_new_dev, tau_new_dev, x_dev, h_dev, hv_dev, nr ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_dot_product_kernel_1_complex_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_new_dev,cuDoubleComplex* x_dev, cuDoubleComplex* h_dev ,cuDoubleComplex* hv_dev, int nb ,int nr , 
int ns) +#else +extern "C" void launch_dot_product_kernel_1_complex_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_new_dev,cuFloatComplex* x_dev, cuFloatComplex* h_dev ,cuFloatComplex* hv_dev, int nb ,int nr , int ns) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_dot_product_kernel_1_complex: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + dot_product_c_kernel_1_complex_double<<>>( ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns ); +#else + dot_product_c_kernel_1_complex_single<<>>( ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} + +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_dot_product_kernel_2_complex_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_dev,cuDoubleComplex* hd_dev, int nb ,int nr , int ne) +#else +extern "C" void launch_dot_product_kernel_2_complex_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_dev,cuFloatComplex* hd_dev, int nb ,int nr , int ne) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_dot_product_kernel_2_complex: %s, %d\n",cudaGetErrorString(err), err); + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_double_hh_transform_1_complex_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev,cuDoubleComplex* 
hv_dev, int nb , int ns) +#else +extern "C" void launch_double_hh_transform_1_complex_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev,cuFloatComplex* hv_dev, int nb , int ns) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_double_hh_transform_1: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + double_hh_transform_kernel_complex_double<<>>( ab_dev, hs_dev, hv_dev, nb, ns ); +#else + double_hh_transform_kernel_complex_single<<>>( ab_dev, hs_dev, hv_dev, nb, ns ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_double_hh_transform_2_complex_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev,cuDoubleComplex* hv_dev, int nc , int ns , int nb ) +#else +extern "C" void launch_double_hh_transform_2_complex_single( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev,cuFloatComplex* hv_dev, int nc , int ns , int nb ) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_double_hh_transform_2: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + double_hh_transform_kernel_2_complex_double<<>>( ab_dev, hd_dev, hv_dev, nc, ns, nb ); +#else + double_hh_transform_kernel_2_complex_single<<>>( ab_dev, hd_dev, hv_dev, nc, ns, nb ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_kernel_reduce_complex_double( cuDoubleComplex* a_dev, 
int lda, int n,int nbw, cuDoubleComplex* h_dev) +#else +extern "C" void launch_compute_kernel_reduce_complex_single( cuFloatComplex* a_dev, int lda, int n,int nbw, cuFloatComplex* h_dev) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_compute_kernel_reduce : %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + compute_kernel_reduce_complex_double<<>>(a_dev, lda, n, nbw,h_dev); +#else + compute_kernel_reduce_complex_single<<>>(a_dev, lda, n, nbw,h_dev); +#endif + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } +} +#endif /* not used anywhere */ + +#if 0 /* not used anywhere */ + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_kernel_reduce_1_complex_double( cuDoubleComplex* a_dev, int lda, int n , cuDoubleComplex* h_dev) +#else +extern "C" void launch_compute_kernel_reduce_1_complex_single( cuFloatComplex* a_dev, int lda, int n , cuFloatComplex* h_dev) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_compute_kernel_reduce_1: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + compute_kernel_reduce_1_complex_double<<>>(a_dev, lda, n, h_dev); +#else + compute_kernel_reduce_1_complex_single<<>>(a_dev, lda, n, h_dev); +#endif + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + } + +} +#endif /* not used anywhere */ + +#endif /* COMPLEXCASE == 1 */ + +#ifndef MEMCPY_ALREADY_DEFINED +extern "C" int cuda_MemcpyDeviceToDevice(int val) +{ + val = cudaMemcpyDeviceToDevice; + return val; +} +#define MEMCPY_ALREADY_DEFINED 1 +#endif diff 
-Nru elpa-2016.05.001/src/GPU/mod_cuda.F90 elpa-2019.11.001/src/GPU/mod_cuda.F90 --- elpa-2016.05.001/src/GPU/mod_cuda.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/GPU/mod_cuda.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,969 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF + + +#include "config-f90.h" +module cuda_functions + use iso_c_binding + use precision + implicit none + + public + + integer(kind=ik) :: cudaMemcpyHostToDevice + integer(kind=ik) :: cudaMemcpyDeviceToHost + integer(kind=ik) :: cudaHostRegisterPortable + integer(kind=ik) :: cudaHostRegisterMapped + integer(kind=ik) :: cudaMemcpyDeviceToDevice + + ! TODO global variable, has to be changed + integer(kind=C_intptr_T) :: cublasHandle = -1 + + integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8 +#ifdef WANT_SINGLE_PRECISION_REAL + integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4 +#endif + + integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8 +#ifdef WANT_SINGLE_PRECISION_COMPLEX + integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4 +#endif + + ! functions to set and query the CUDA devices + interface + function cublas_create_c(handle) result(istat) & + bind(C, name="cublasCreateFromC") + use iso_c_binding + implicit none + integer(kind=C_intptr_T) :: handle + integer(kind=C_INT) :: istat + end function cublas_create_c + end interface + + interface + function cublas_destroy_c(handle) result(istat) & + bind(C, name="cublasDestroyFromC") + use iso_c_binding + implicit none + integer(kind=C_intptr_T) :: handle + integer(kind=C_INT) :: istat + end function cublas_destroy_c + end interface + + interface + function cuda_threadsynchronize_c() result(istat) & + bind(C,name="cudaThreadSynchronizeFromC") + use iso_c_binding + implicit none + integer(kind=C_INT) :: istat + end function cuda_threadsynchronize_c + end interface + + interface + function cuda_setdevice_c(n) result(istat) & + bind(C, name="cudaSetDeviceFromC") + + use iso_c_binding + implicit none + integer(kind=C_INT), value :: n + integer(kind=C_INT) :: istat + end function cuda_setdevice_c + end interface + + interface + function cuda_getdevicecount_c(n) result(istat) & + bind(C, name="cudaGetDeviceCountFromC") + use iso_c_binding + 
implicit none + integer(kind=C_INT), intent(out) :: n + integer(kind=C_INT) :: istat + end function cuda_getdevicecount_c + end interface + + interface + function cuda_devicesynchronize_c()result(istat) & + bind(C,name='cudaDeviceSynchronizeFromC') + + use iso_c_binding + + implicit none + integer(kind=C_INT) :: istat + + end function cuda_devicesynchronize_c + end interface + + + ! functions to copy CUDA memory + interface + function cuda_memcpyDeviceToDevice_c() result(flag) & + bind(C, name="cudaMemcpyDeviceToDeviceFromC") + use iso_c_binding + implicit none + integer(kind=c_int) :: flag + end function + end interface + + interface + function cuda_memcpyHostToDevice_c() result(flag) & + bind(C, name="cudaMemcpyHostToDeviceFromC") + use iso_c_binding + implicit none + integer(kind=c_int) :: flag + end function + end interface + + interface + function cuda_memcpyDeviceToHost_c() result(flag) & + bind(C, name="cudaMemcpyDeviceToHostFromC") + use iso_c_binding + implicit none + integer(kind=c_int) :: flag + end function + end interface + + interface + function cuda_hostRegisterPortable_c() result(flag) & + bind(C, name="cudaHostRegisterPortableFromC") + use iso_c_binding + implicit none + integer(kind=c_int) :: flag + end function + end interface + + interface + function cuda_hostRegisterMapped_c() result(flag) & + bind(C, name="cudaHostRegisterMappedFromC") + use iso_c_binding + implicit none + integer(kind=c_int) :: flag + end function + end interface + + interface + function cuda_memcpy_c(dst, src, size, dir) result(istat) & + bind(C, name="cudaMemcpyFromC") + + use iso_c_binding + + implicit none + integer(kind=C_intptr_t), value :: dst + integer(kind=C_intptr_t), value :: src + integer(kind=c_intptr_t), intent(in), value :: size + integer(kind=C_INT), intent(in), value :: dir + integer(kind=C_INT) :: istat + + end function cuda_memcpy_c + end interface + + interface + function cuda_memcpy2d_c(dst, dpitch, src, spitch, width, height , dir) result(istat) & + 
bind(C, name="cudaMemcpy2dFromC") + + use iso_c_binding + + implicit none + + integer(kind=C_intptr_T), value :: dst + integer(kind=c_intptr_t), intent(in), value :: dpitch + integer(kind=C_intptr_T), value :: src + integer(kind=c_intptr_t), intent(in), value :: spitch + integer(kind=c_intptr_t), intent(in), value :: width + integer(kind=c_intptr_t), intent(in), value :: height + integer(kind=C_INT), intent(in), value :: dir + integer(kind=C_INT) :: istat + + end function cuda_memcpy2d_c + end interface + + ! functions to allocate and free CUDA memory + + interface + function cuda_free_c(a) result(istat) & + bind(C, name="cudaFreeFromC") + + use iso_c_binding + + implicit none + integer(kind=C_intptr_T), value :: a + integer(kind=C_INT) :: istat + + end function cuda_free_c + end interface + + interface + function cuda_malloc_c(a, width_height) result(istat) & + bind(C, name="cudaMallocFromC") + + use iso_c_binding + implicit none + + integer(kind=C_intptr_T) :: a + integer(kind=c_intptr_t), intent(in), value :: width_height + integer(kind=C_INT) :: istat + + end function cuda_malloc_c + end interface + + interface + function cuda_memset_c(a, val, size) result(istat) & + bind(C, name="cudaMemsetFromC") + + use iso_c_binding + + implicit none + + integer(kind=C_intptr_T), value :: a + integer(kind=C_INT), value :: val + integer(kind=c_intptr_t), intent(in), value :: size + integer(kind=C_INT) :: istat + + end function cuda_memset_c + end interface + + ! 
cuBLAS + interface + subroutine cublas_dgemm_c(handle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) & + bind(C,name='cublasDgemm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT),value :: m,n,k + integer(kind=C_INT), intent(in), value :: lda,ldb,ldc + real(kind=C_DOUBLE),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, b, c + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_dgemm_c + end interface + + interface + subroutine cublas_sgemm_c(handle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) & + bind(C,name='cublasSgemm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT),value :: m,n,k + integer(kind=C_INT), intent(in), value :: lda,ldb,ldc + real(kind=C_FLOAT),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, b, c + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_sgemm_c + end interface + + interface + subroutine cublas_dtrmm_c(handle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) & + bind(C,name='cublasDtrmm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,ldb + real(kind=C_DOUBLE), value :: alpha + integer(kind=C_intptr_T), value :: a, b + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_dtrmm_c + end interface + + interface + subroutine cublas_strmm_c(handle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) & + bind(C,name='cublasStrmm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,ldb + real(kind=C_FLOAT), value :: alpha + integer(kind=C_intptr_T), value :: a, b + integer(kind=C_intptr_T), value :: handle + + end subroutine 
cublas_strmm_c + end interface + + interface + subroutine cublas_zgemm_c(handle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) & + bind(C,name='cublasZgemm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT),value :: m,n,k + integer(kind=C_INT), intent(in), value :: lda,ldb,ldc + complex(kind=C_DOUBLE_COMPLEX),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, b, c + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_zgemm_c + end interface + + interface + subroutine cublas_cgemm_c(handle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) & + bind(C,name='cublasCgemm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT),value :: m,n,k + integer(kind=C_INT), intent(in), value :: lda,ldb,ldc + complex(kind=C_FLOAT_COMPLEX),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, b, c + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_cgemm_c + end interface + + interface + subroutine cublas_ztrmm_c(handle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) & + bind(C,name='cublasZtrmm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,ldb + complex(kind=C_DOUBLE_COMPLEX), value :: alpha + integer(kind=C_intptr_T), value :: a, b + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_ztrmm_c + end interface + + interface + subroutine cublas_ctrmm_c(handle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) & + bind(C,name='cublasCtrmm_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,ldb + complex(kind=C_FLOAT_COMPLEX), value :: alpha + integer(kind=C_intptr_T), value :: a, b + 
integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_ctrmm_c + end interface + + interface + subroutine cublas_dgemv_c(handle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) & + bind(C,name='cublasDgemv_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,incx,incy + real(kind=C_DOUBLE),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, x, y + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_dgemv_c + end interface + + interface + subroutine cublas_sgemv_c(handle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) & + bind(C,name='cublasSgemv_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,incx,incy + real(kind=C_FLOAT),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, x, y + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_sgemv_c + end interface + + interface + subroutine cublas_zgemv_c(handle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) & + bind(C,name='cublasZgemv_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,incx,incy + complex(kind=C_DOUBLE_COMPLEX),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, x, y + integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_zgemv_c + end interface + + interface + subroutine cublas_cgemv_c(handle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) & + bind(C,name='cublasCgemv_elpa_wrapper') + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT),value :: m,n + integer(kind=C_INT), intent(in), value :: lda,incx,incy + complex(kind=C_FLOAT_COMPLEX),value :: alpha,beta + integer(kind=C_intptr_T), value :: a, x, y + 
integer(kind=C_intptr_T), value :: handle + + end subroutine cublas_cgemv_c + end interface + + +#ifdef WITH_NVTX + ! NVTX profiling interfaces + interface nvtxRangePushA + subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') + use iso_c_binding + character(kind=C_CHAR,len=1) :: name(*) + end subroutine + end interface + + interface nvtxRangePop + subroutine nvtxRangePop() bind(C, name='nvtxRangePop') + end subroutine + end interface +#endif + + contains + +#ifdef WITH_NVTX + ! this wrapper is needed for the string conversion + subroutine nvtxRangePush(range_name) + implicit none + character(len=*), intent(in) :: range_name + + character(kind=C_CHAR,len=1), dimension(len(range_name)+1) :: c_name + integer i + + do i = 1, len(range_name) + c_name(i) = range_name(i:i) + end do + c_name(len(range_name)+1) = char(0) + + call nvtxRangePushA(c_name) + end subroutine +#endif + + ! functions to set and query the CUDA devices + + function cublas_create(handle) result(success) + use iso_c_binding + implicit none + + integer(kind=C_intptr_t) :: handle + logical :: success +#ifdef WITH_GPU_VERSION + success = cublas_create_c(handle) /= 0 +#else + success = .true. +#endif + end function + + function cublas_destroy(handle) result(success) + use iso_c_binding + implicit none + + integer(kind=C_intptr_t) :: handle + logical :: success +#ifdef WITH_GPU_VERSION + success = cublas_destroy_c(handle) /= 0 +#else + success = .true. +#endif + end function + + function cuda_threadsynchronize() result(success) + use iso_c_binding + + implicit none + + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_threadsynchronize_c() /= 0 +#else + success = .true. +#endif + end function cuda_threadsynchronize + + function cuda_setdevice(n) result(success) + use iso_c_binding + + implicit none + + integer(kind=ik), intent(in) :: n + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_setdevice_c(int(n,kind=c_int)) /= 0 +#else + success = .true. 
+#endif + end function cuda_setdevice + + function cuda_getdevicecount(n) result(success) + use iso_c_binding + implicit none + + integer(kind=ik) :: n + integer(kind=c_int) :: nCasted + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_getdevicecount_c(nCasted) /=0 + n = int(nCasted) +#else + success = .true. + n = 0 +#endif + end function cuda_getdevicecount + + function cuda_devicesynchronize()result(success) + + use iso_c_binding + + implicit none + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_devicesynchronize_c() /=0 +#else + success = .true. +#endif + end function cuda_devicesynchronize + ! functions to allocate and free memory + + function cuda_malloc(a, width_height) result(success) + + use iso_c_binding + implicit none + + integer(kind=C_intptr_t) :: a + integer(kind=c_intptr_t), intent(in) :: width_height + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_malloc_c(a, width_height) /= 0 +#else + success = .true. +#endif + end function + + function cuda_free(a) result(success) + + use iso_c_binding + + implicit none + integer(kind=C_intptr_T) :: a + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_free_c(a) /= 0 +#else + success = .true. +#endif + end function cuda_free + + function cuda_memset(a, val, size) result(success) + + use iso_c_binding + + implicit none + + integer(kind=c_intptr_t) :: a + integer(kind=ik) :: val + integer(kind=c_intptr_t), intent(in) :: size + integer(kind=C_INT) :: istat + + logical :: success +#ifdef WITH_GPU_VERSION + success= cuda_memset_c(a, int(val,kind=c_int), int(size,kind=c_intptr_t)) /=0 +#else + success = .true. +#endif + end function cuda_memset + + ! 
functions to memcopy CUDA memory + + function cuda_memcpyDeviceToDevice() result(flag) + use iso_c_binding + implicit none + integer(kind=ik) :: flag +#ifdef WITH_GPU_VERSION + flag = int(cuda_memcpyDeviceToDevice_c()) +#else + flag = 0 +#endif + end function + + function cuda_memcpyHostToDevice() result(flag) + use iso_c_binding + use precision + implicit none + integer(kind=ik) :: flag +#ifdef WITH_GPU_VERSION + flag = int(cuda_memcpyHostToDevice_c()) +#else + flag = 0 +#endif + end function + + function cuda_memcpyDeviceToHost() result(flag) + use iso_c_binding + use precision + implicit none + integer(kind=ik) :: flag +#ifdef WITH_GPU_VERSION + flag = int( cuda_memcpyDeviceToHost_c()) +#else + flag = 0 +#endif + end function + + function cuda_hostRegisterPortable() result(flag) + use iso_c_binding + use precision + implicit none + integer(kind=ik) :: flag +#ifdef WITH_GPU_VERSION + flag = int(cuda_hostRegisterPortable_c()) +#else + flag = 0 +#endif + end function + + function cuda_hostRegisterMapped() result(flag) + use iso_c_binding + use precision + implicit none + integer(kind=ik) :: flag +#ifdef WITH_GPU_VERSION + flag = int(cuda_hostRegisterMapped_c()) +#else + flag = 0 +#endif + end function + + function cuda_memcpy(dst, src, size, dir) result(success) + + use iso_c_binding + + implicit none + integer(kind=C_intptr_t) :: dst + integer(kind=C_intptr_t) :: src + integer(kind=c_intptr_t), intent(in) :: size + integer(kind=C_INT), intent(in) :: dir + logical :: success + +#ifdef WITH_GPU_VERSION + success = cuda_memcpy_c(dst, src, size, dir) /= 0 +#else + success = .true. 
+#endif + end function + + function cuda_memcpy2d(dst, dpitch, src, spitch, width, height , dir) result(success) + + use iso_c_binding + + implicit none + + integer(kind=C_intptr_T) :: dst + integer(kind=c_intptr_t), intent(in) :: dpitch + integer(kind=C_intptr_T) :: src + integer(kind=c_intptr_t), intent(in) :: spitch + integer(kind=c_intptr_t), intent(in) :: width + integer(kind=c_intptr_t), intent(in) :: height + integer(kind=C_INT), intent(in) :: dir + logical :: success +#ifdef WITH_GPU_VERSION + success = cuda_memcpy2d_c(dst, dpitch, src, spitch, width, height , dir) /= 0 +#else + success = .true. +#endif + end function cuda_memcpy2d + + ! cuBLAS + subroutine cublas_dgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT) :: m,n,k + integer(kind=C_INT), intent(in) :: lda,ldb,ldc + real(kind=C_DOUBLE) :: alpha,beta + integer(kind=C_intptr_T) :: a, b, c +#ifdef WITH_GPU_VERSION + call cublas_dgemm_c(cublasHandle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) +#endif + end subroutine cublas_dgemm + + subroutine cublas_sgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT) :: m,n,k + integer(kind=C_INT), intent(in) :: lda,ldb,ldc + real(kind=C_FLOAT) :: alpha,beta + integer(kind=C_intptr_T) :: a, b, c +#ifdef WITH_GPU_VERSION + call cublas_sgemm_c(cublasHandle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) +#endif + end subroutine cublas_sgemm + + subroutine cublas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,ldb + real(kind=C_DOUBLE) :: alpha + integer(kind=C_intptr_T) :: a, b +#ifdef WITH_GPU_VERSION + call cublas_dtrmm_c(cublasHandle, side, uplo, trans, 
diag, m, n, alpha, a, lda, b, ldb) +#endif + end subroutine cublas_dtrmm + + subroutine cublas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,ldb + real(kind=C_FLOAT) :: alpha + integer(kind=C_intptr_T) :: a, b +#ifdef WITH_GPU_VERSION + call cublas_strmm_c(cublasHandle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) +#endif + end subroutine cublas_strmm + + subroutine cublas_zgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT) :: m,n,k + integer(kind=C_INT), intent(in) :: lda,ldb,ldc + complex(kind=C_DOUBLE_COMPLEX) :: alpha,beta + integer(kind=C_intptr_T) :: a, b, c +#ifdef WITH_GPU_VERSION + call cublas_zgemm_c(cublasHandle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) +#endif + end subroutine cublas_zgemm + + subroutine cublas_cgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta, ctb + integer(kind=C_INT) :: m,n,k + integer(kind=C_INT), intent(in) :: lda,ldb,ldc + complex(kind=C_FLOAT_COMPLEX) :: alpha,beta + integer(kind=C_intptr_T) :: a, b, c +#ifdef WITH_GPU_VERSION + call cublas_cgemm_c(cublasHandle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) +#endif + end subroutine cublas_cgemm + + subroutine cublas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,ldb + complex(kind=C_DOUBLE_COMPLEX) :: alpha + integer(kind=C_intptr_T) :: a, b +#ifdef WITH_GPU_VERSION + call cublas_ztrmm_c(cublasHandle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) +#endif + end subroutine cublas_ztrmm + + 
subroutine cublas_ctrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) + + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: side, uplo, trans, diag + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,ldb + complex(kind=C_FLOAT_COMPLEX) :: alpha + integer(kind=C_intptr_T) :: a, b +#ifdef WITH_GPU_VERSION + call cublas_ctrmm_c(cublasHandle, side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb) +#endif + end subroutine cublas_ctrmm + + subroutine cublas_dgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,incx,incy + real(kind=C_DOUBLE) :: alpha,beta + integer(kind=C_intptr_T) :: a, x, y +#ifdef WITH_GPU_VERSION + call cublas_dgemv_c(cublasHandle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) +#endif + end subroutine cublas_dgemv + + subroutine cublas_sgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,incx,incy + real(kind=C_FLOAT) :: alpha,beta + integer(kind=C_intptr_T) :: a, x, y +#ifdef WITH_GPU_VERSION + call cublas_sgemv_c(cublasHandle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) +#endif + end subroutine cublas_sgemv + + subroutine cublas_zgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta + integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,incx,incy + complex(kind=C_DOUBLE_COMPLEX) :: alpha,beta + integer(kind=C_intptr_T) :: a, x, y +#ifdef WITH_GPU_VERSION + call cublas_zgemv_c(cublasHandle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) +#endif + end subroutine cublas_zgemv + + subroutine cublas_cgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) + use iso_c_binding + + implicit none + character(1,C_CHAR),value :: cta 
+ integer(kind=C_INT) :: m,n + integer(kind=C_INT), intent(in) :: lda,incx,incy + complex(kind=C_FLOAT_COMPLEX) :: alpha,beta + integer(kind=C_intptr_T) :: a, x, y +#ifdef WITH_GPU_VERSION + call cublas_cgemv_c(cublasHandle, cta, m, n, alpha, a, lda, x, incx, beta, y, incy) +#endif + end subroutine cublas_cgemv + + +! subroutine cublas_dsymv(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! use iso_c_binding +! +! implicit none +! character(1,C_CHAR),value :: cta +! integer(kind=C_INT) :: n +! integer(kind=C_INT), intent(in) :: lda,incx,incy +! real(kind=C_DOUBLE) :: alpha,beta +! integer(kind=C_intptr_T) :: a, x, y +! #ifdef WITH_GPU_VERSION +! call cublas_dsymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! #endif +! end subroutine cublas_dsymv +! +! subroutine cublas_ssymv(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! use iso_c_binding +! +! implicit none +! character(1,C_CHAR),value :: cta +! integer(kind=C_INT) :: n +! integer(kind=C_INT), intent(in) :: lda,incx,incy +! real(kind=C_FLOAT) :: alpha,beta +! integer(kind=C_intptr_T) :: a, x, y +! #ifdef WITH_GPU_VERSION +! call cublas_ssymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! #endif +! end subroutine cublas_ssymv +! +! subroutine cublas_zsymv(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! use iso_c_binding +! +! implicit none +! character(1,C_CHAR),value :: cta +! integer(kind=C_INT) :: n +! integer(kind=C_INT), intent(in) :: lda,incx,incy +! complex(kind=C_DOUBLE_COMPLEX) :: alpha,beta +! integer(kind=C_intptr_T) :: a, x, y +! #ifdef WITH_GPU_VERSION +! ! call cublas_zsymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! #endif +! end subroutine cublas_zsymv +! +! subroutine cublas_csymv(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! use iso_c_binding +! +! implicit none +! character(1,C_CHAR),value :: cta +! integer(kind=C_INT) :: n +! integer(kind=C_INT), intent(in) :: lda,incx,incy +! complex(kind=C_FLOAT_COMPLEX) :: alpha,beta +! integer(kind=C_intptr_T) :: a, x, y +! 
#ifdef WITH_GPU_VERSION +! ! call cublas_csymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy) +! #endif +! end subroutine cublas_csymv + + +end module cuda_functions diff -Nru elpa-2016.05.001/src/helpers/aligned_mem.F90 elpa-2019.11.001/src/helpers/aligned_mem.F90 --- elpa-2016.05.001/src/helpers/aligned_mem.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/aligned_mem.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,63 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! 
may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: Lorenz Huedepohl, MPCDF + +module aligned_mem + use, intrinsic :: iso_c_binding + + interface + function posix_memalign(memptr, alignment, size) result(error) bind(C, name="posix_memalign") + import c_int, c_intptr_t, c_ptr + integer(kind=c_int) :: error + type(c_ptr), intent(inout) :: memptr + integer(kind=c_intptr_t), intent(in), value :: alignment, size + end function + end interface + + interface + subroutine free(ptr) bind(C, name="free") + import c_ptr + type(c_ptr), value :: ptr + end subroutine + end interface + +end module diff -Nru elpa-2016.05.001/src/helpers/fortran_blas_interfaces.F90 elpa-2019.11.001/src/helpers/fortran_blas_interfaces.F90 --- elpa-2016.05.001/src/helpers/fortran_blas_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/fortran_blas_interfaces.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,1029 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! 
ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + + interface + subroutine dger(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use precision + implicit none + integer(kind=BLAS_KIND) :: M, N, INCX, INCY, LDA + real(kind=rk8), intent(in) :: ALPHA, X(*), Y(*) + real(kind=rk8), intent(inout) :: A(LDA, *) + end subroutine + end interface + + interface + subroutine daxpy(N, DA, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + real(kind=rk8), intent(in) :: DA, DX(*) + real(kind=rk8), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine dcopy(N, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + real(kind=rk8), intent(in) :: DX(*) + real(kind=rk8), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine dscal(N, DA, DX, INCX) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX + real(kind=rk8) :: DA + real(kind=rk8), intent(inout) :: DX(*) + end subroutine + + end interface + + interface + subroutine dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, 
LDA, LDB, LDC + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: A(LDA, *), B(LDB, *), C(LDC, *) + end subroutine + end interface + + interface + subroutine dtrtri(UPLO, DIAG, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO, DIAG + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: a(lda, *) + end subroutine + end interface + + interface + subroutine dpotrf(UPLO, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: a(lda, *) + end subroutine + end interface + + interface + subroutine dtrsm(SIDE, UPLO, TRANSA, DIAG, M,N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + real(kind=rk8) :: ALPHA + real(kind=rk8) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine dgemv(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: TRANS + integer(kind=BLAS_KIND) :: M, N, LDA, INCX, INCY + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine dtrmv(UPLO, TRANS, DIAG, N, A, LDA, X, INCX) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS, DIAG + integer(kind=BLAS_KIND) :: N, LDA, INCX + real(kind=rk8) :: a(lda, *), x(*) + end subroutine + end interface + + interface + subroutine dtrmm(SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + real(kind=rk8) :: ALPHA + real(kind=rk8) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine dsyrk(UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + 
integer(kind=BLAS_KIND) :: N, K, LDA, LDC + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: a(lda, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine dsymv(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA, INCX, INCY + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine dsymm(SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO + integer(kind=BLAS_KIND) :: M, N, LDA, LDB, LDC + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine dsyr2(UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, INCX, INCY, LDA + real(kind=rk8) :: ALPHA + real(kind=rk8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine dsyr2k(UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + integer(kind=BLAS_KIND) :: N, K, LDA, LDB, LDC + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine dgeqrf(M, N, A, LDA, TAU, WORK, LWORK, INFO) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, LDA, LWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: a(lda, *), TAU(*), WORK(*) + end subroutine + end interface + + interface + subroutine dstedc(COMPZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK, LIWORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ, LWORK, IWORK(*), LIWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: D(*), E(*), z(ldz, *), work(*) + end subroutine + end interface 
+ + interface + subroutine dsteqr(COMPZ, N, D, E, Z, LDZ, WORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: D(*), E(*), z(ldz, *), work(*) + end subroutine + end interface + + interface + subroutine dlamrg(N1, N2, A, DTRD1, DTRD2, INDEX) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND), intent(in) :: N1, N2, DTRD1, DTRD2 + integer(kind=BLAS_KIND), intent(inout) :: INDEX(*) + real(kind=rk8), intent(in) :: A(*) + end subroutine + end interface + + interface + function dlamch(CMACG) result(DMACH) + use PRECISION_MODULE + implicit none + character :: CMACG + real(kind=rk8) :: DMACH + end function + end interface + + interface + function dlapy2(X, Y) result(sqrt_x2_y2) + use PRECISION_MODULE + implicit none + real(kind=rk8) :: x, y, sqrt_x2_y2 + + end function + end interface + + interface + subroutine dlaed4(N, I, D, Z, DELTA, RHO, DLAM, INFO) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, I + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: D(*), Z(*), DELTA(*), RHO, DLAM + end subroutine + end interface + + interface + subroutine dlaed5(I, D, Z, DELTA, RHO, DLAM) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: I + real(kind=rk8) :: D(2), Z(2), DELTA(2), RHO, DLAM + end subroutine + end interface + + interface + function dnrm2(N,X, INCX) result(nrm2) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, INCX + real(kind=rk8) :: x(*), nrm2 + + end function + end interface + + interface + subroutine dlaset(UPLO, M, N, ALPHA, BETA, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, LDA + real(kind=rk8) :: ALPHA, BETA, A(lda, *) + end subroutine + end interface + + interface + function dlange(NORM, M, N, A, LDA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: NORM + 
integer(kind=BLAS_KIND) :: M, N, LDA + real(kind=rk8) :: A(lda, *) + real(kind=rk8), intent(inout) :: work(*) + real(kind=rk8) :: norm2 + end function + end interface + +!#endif /* DOUBLE_PRECISION_REAL */ + interface + subroutine sger(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use precision + implicit none + integer(kind=BLAS_KIND) :: M, N, INCX, INCY, LDA + real(kind=rk4), intent(in) :: ALPHA, X(*), Y(*) + real(kind=rk4), intent(inout) :: A(LDA, *) + end subroutine + end interface + + interface + subroutine saxpy(N, DA, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + real(kind=rk4), intent(in) :: DA, DX(*) + real(kind=rk4), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine scopy(N, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + real(kind=rk4), intent(in) :: DX(*) + real(kind=rk4), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine sscal(N, DA, DX, INCX) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX + real(kind=rk4) :: DA + real(kind=rk4), intent(inout) :: DX(*) + end subroutine + end interface + + interface + subroutine sgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, LDA, LDB, LDC + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: A(LDA, *), B(LDB, *), C(LDC, *) + end subroutine + end interface + + interface + subroutine strtri(UPLO, DIAG, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO, DIAG + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: a(lda, *) + end subroutine + end interface + + interface + subroutine spotrf(UPLO, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) 
:: INFO + real(kind=rk4) :: a(lda, *) + end subroutine + end interface + + interface + subroutine strsm(SIDE, UPLO, TRANSA, DIAG, M,N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + real(kind=rk4) :: ALPHA + real(kind=rk4) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine sgemv(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: TRANS + integer(kind=BLAS_KIND) :: M, N, LDA, INCX, INCY + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine strmv(UPLO, TRANS, DIAG, N, A, LDA, X, INCX) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS, DIAG + integer(kind=BLAS_KIND) :: N, LDA, INCX + real(kind=rk4) :: a(lda, *), x(*) + end subroutine + end interface + + interface + subroutine strmm(SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + real(kind=rk4) :: ALPHA + real(kind=rk4) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine ssyrk(UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + integer(kind=BLAS_KIND) :: N, K, LDA, LDC + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: a(lda, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine ssymv(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA, INCX, INCY + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine ssymm(SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO + 
integer(kind=BLAS_KIND) :: M, N, LDA, LDB, LDC + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine ssyr2(UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, INCX, INCY, LDA + real(kind=rk4) :: ALPHA + real(kind=rk4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine ssyr2k(UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + integer(kind=BLAS_KIND) :: N, K, LDA, LDB, LDC + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine sgeqrf(M, N, A, LDA, TAU, WORK, LWORK, INFO) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, LDA, LWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: a(lda, *), TAU(*), WORK(*) + end subroutine + end interface + + interface + subroutine sstedc(COMPZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK, LIWORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ, LWORK, IWORK(*), LIWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: D(*), E(*), z(ldz, *), work(*) + end subroutine + end interface + + interface + subroutine ssteqr(COMPZ, N, D, E, Z, LDZ, WORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: D(*), E(*), z(ldz, *), work(*) + end subroutine + end interface + + interface + subroutine slamrg(N1, N2, A, DTRD1, DTRD2, INDEX) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND), intent(in) :: N1, N2, DTRD1, DTRD2 + integer(kind=BLAS_KIND), intent(inout) :: INDEX(*) + real(kind=rk4), intent(in) :: A(*) + end subroutine + end interface + + 
interface + function slamch(CMACG) result(DMACH) + use PRECISION_MODULE + implicit none + character :: CMACG + real(kind=rk4) :: DMACH + end function + end interface + + interface + function slapy2(X, Y) result(sqrt_x2_y2) + use PRECISION_MODULE + implicit none + real(kind=rk4) :: x, y, sqrt_x2_y2 + + end function + end interface + + interface + subroutine slaed4(N, I, D, Z, DELTA, RHO, DLAM, INFO) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, I + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: D(*), Z(*), DELTA(*), RHO, DLAM + end subroutine + end interface + + interface + subroutine slaed5(I, D, Z, DELTA, RHO, DLAM) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: I + real(kind=rk4) :: D(2), Z(2), DELTA(2), RHO, DLAM + end subroutine + end interface + + + interface + function snrm2(N,X, INCX) result(nrm2) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, INCX + real(kind=rk4) :: x(*), nrm2 + + end function + end interface + + interface + subroutine slaset(UPLO, M, N, ALPHA, BETA, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, LDA + real(kind=rk4) :: ALPHA, BETA, A(lda, *) + end subroutine + end interface + + interface + function slange(NORM, M, N, A, LDA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: NORM + integer(kind=BLAS_KIND) :: M, N, LDA + real(kind=rk4) :: A(lda, *) + real(kind=rk4), intent(inout) :: work(*) + real(kind=rk4) :: norm2 + end function + end interface + + +!#endif /* SINGLE_PRECSION_REAL */ + interface + complex*16 function zdotc(N, ZX, INCX, ZY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck8), intent(in) :: ZX(*), ZY(*) + end function + end interface + + interface + subroutine zaxpy(N, DA, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck8), intent(in) :: DA, DX(*) 
+ complex(kind=ck8), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine zcopy(N, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck8), intent(in) :: DX(*) + complex(kind=ck8), intent(inout) :: DY(*) + end subroutine + end interface + + + interface + subroutine zscal(N, DA, DX, INCX) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX + complex(kind=ck8) :: DA + complex(kind=ck8), intent(inout) :: DX(*) + end subroutine + end interface + + interface + subroutine zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, LDA, LDB, LDC + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: A(LDA, *), B(LDB, *), C(LDC, *) + end subroutine + end interface + + interface + subroutine ztrtri(UPLO, DIAG, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO, DIAG + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck8) :: a(lda, *) + end subroutine + end interface + + interface + subroutine zpotrf(UPLO, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck8) :: a(lda, *) + end subroutine + end interface + + interface + subroutine ztrsm(SIDE, UPLO, TRANSA, DIAG, M,N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + complex(kind=ck8) :: ALPHA + complex(kind=ck8) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine zgemv(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: TRANS + integer(kind=BLAS_KIND) :: M, N, LDA, INCX, INCY + complex(kind=ck8) :: ALPHA, BETA + 
complex(kind=ck8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine ztrmv(UPLO, TRANS, DIAG, N, A, LDA, X, INCX) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS, DIAG + integer(kind=BLAS_KIND) :: N, LDA, INCX + complex(kind=ck8) :: a(lda, *), x(*) + end subroutine + end interface + + interface + subroutine ztrmm(SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + complex(kind=ck8) :: ALPHA + complex(kind=ck8) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine zherk(UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + integer(kind=BLAS_KIND) :: N, K, LDA, LDC + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: a(lda, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine zhemv(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA, INCX, INCY + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine zsymm(SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO + integer(kind=BLAS_KIND) :: M, N, LDA, LDB, LDC + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine zher2(UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, INCX, INCY, LDA + complex(kind=ck8) :: ALPHA + complex(kind=ck8) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine zgeqrf(M, N, A, LDA, TAU, WORK, LWORK, INFO) + use PRECISION_MODULE + implicit none + 
integer(kind=BLAS_KIND) :: M, N, LDA, LWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck8) :: a(lda, *), TAU(*), WORK(*) + end subroutine + end interface + +#if 0 + ! not used + interface + subroutine zstedc(COMPZ, N, D, E, Z, LDZ, WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ, LWORK, LRWORK, IWORK(*), LIWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk8) :: D(*), E(*), RWORK(*) + complex(kind=ck8) :: z(ldz, *), work(*) + end subroutine + end interface +#endif + + interface + subroutine zlaset(UPLO, M, N, ALPHA, BETA, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, LDA + complex(kind=ck8) :: ALPHA, BETA, A(lda, *) + end subroutine + end interface + + interface + function zlange(NORM, M, N, A, LDA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: NORM + integer(kind=BLAS_KIND) :: M, N, LDA + complex(kind=ck8) :: A(lda, *) + real(kind=rk8), intent(inout) :: work(*) + real(kind=rk8) :: norm2 + end function + end interface + + + +!#endif /* DOUBLE_PRECISION_COMPLEX */ + interface + complex*8 function cdotc(N, ZX, INCX, ZY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck4), intent(in) :: ZX(*), ZY(*) + end function + end interface + + interface + subroutine caxpy(N, DA, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck4), intent(in) :: DA, DX(*) + complex(kind=ck4), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine ccopy(N, DX, INCX, DY, INCY) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, INCX, INCY + complex(kind=ck4), intent(in) :: DX(*) + complex(kind=ck4), intent(inout) :: DY(*) + end subroutine + end interface + + interface + subroutine cscal(N, DA, DX, INCX) + use precision + implicit 
none + integer(kind=BLAS_KIND) :: N, INCX + complex(kind=ck4) :: DA + complex(kind=ck4), intent(inout) :: DX(*) + end subroutine + end interface + + interface + subroutine cgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, LDA, LDB, LDC + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: A(LDA, *), B(LDB, *), C(LDC, *) + end subroutine + end interface + + interface + subroutine ctrtri(UPLO, DIAG, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO, DIAG + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck4) :: a(lda, *) + end subroutine + end interface + + interface + subroutine cpotrf(UPLO, N, A, LDA, INFO) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck4) :: a(lda, *) + end subroutine + end interface + + interface + subroutine ctrsm(SIDE, UPLO, TRANSA, DIAG, M,N, ALPHA, A, LDA, B, LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + complex(kind=ck4) :: ALPHA + complex(kind=ck4) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine cgemv(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: TRANS + integer(kind=BLAS_KIND) :: M, N, LDA, INCX, INCY + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine ctrmv(UPLO, TRANS, DIAG, N, A, LDA, X, INCX) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS, DIAG + integer(kind=BLAS_KIND) :: N, LDA, INCX + complex(kind=ck4) :: a(lda, *), x(*) + end subroutine + end interface + + interface + subroutine ctrmm(SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, B, 
LDB) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO, TRANSA, DIAG + integer(kind=BLAS_KIND) :: M, N, LDA, LDB + complex(kind=ck4) :: ALPHA + complex(kind=ck4) :: a(lda, *), b(ldb, *) + end subroutine + end interface + + interface + subroutine cherk(UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: UPLO, TRANS + integer(kind=BLAS_KIND) :: N, K, LDA, LDC + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: a(lda, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine chemv(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, LDA, INCX, INCY + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine csymm(SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + use PRECISION_MODULE + implicit none + character :: SIDE, UPLO + integer(kind=BLAS_KIND) :: M, N, LDA, LDB, LDC + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: a(lda, *), b(ldb, *), c(ldc, *) + end subroutine + end interface + + interface + subroutine cher2(UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: N, INCX, INCY, LDA + complex(kind=ck4) :: ALPHA + complex(kind=ck4) :: a(lda, *), x(*), y(*) + end subroutine + end interface + + interface + subroutine cgeqrf(M, N, A, LDA, TAU, WORK, LWORK, INFO) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, LDA, LWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + complex(kind=ck4) :: a(lda, *), TAU(*), WORK(*) + end subroutine + end interface + +#if 0 + ! 
not used + interface + subroutine cstedc(COMPZ, N, D, E, Z, LDZ, WORK, LWORK, RWORK, LRWORK, IWORK, LIWORK, INFO) + use PRECISION_MODULE + implicit none + character :: COMPZ + integer(kind=BLAS_KIND) :: N, LDZ, LWORK, LRWORK, IWORK(*), LIWORK + integer(kind=BLAS_KIND), intent(inout) :: INFO + real(kind=rk4) :: D(*), E(*), RWORK(*) + complex(kind=ck4) :: z(ldz, *), work(*) + end subroutine + end interface +#endif + + + interface + subroutine claset(UPLO, M, N, ALPHA, BETA, A, LDA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, LDA + complex(kind=ck4) :: ALPHA, BETA, A(lda, *) + end subroutine + end interface + + interface + function clange(NORM, M, N, A, LDA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: NORM + integer(kind=BLAS_KIND) :: M, N, LDA + complex(kind=ck4) :: A(lda, *) + real(kind=rk4), intent(inout) :: work(*) + real(kind=rk4) :: norm2 + end function + end interface + + diff -Nru elpa-2016.05.001/src/helpers/fortran_scalapack_interfaces.F90 elpa-2019.11.001/src/helpers/fortran_scalapack_interfaces.F90 --- elpa-2016.05.001/src/helpers/fortran_scalapack_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/fortran_scalapack_interfaces.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,305 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! 
http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF + + + interface + subroutine descinit(DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD, INFO) + use precision + implicit none + integer(kind=BLAS_KIND) :: DESC(*), M, N, MB, NB, IRSRC, ICSRC, LLD + integer(kind=BLAS_KIND), intent(inout) :: info + integer(kind=BLAS_KIND) :: ICTXT + end subroutine + end interface + + interface + subroutine blacs_gridinit(ICONTXT, ORDER, NPROW, NPCOL) + use precision + implicit none + integer(kind=BLAS_KIND) :: ICONTXT + character(len=1) :: ORDER + integer(kind=BLAS_KIND) :: NPROW, NPCOL + end subroutine + end interface + + interface + subroutine blacs_gridexit(ICONTXT) + use precision + implicit none + integer(kind=BLAS_KIND) :: ICONTXT + end subroutine + end interface + + interface + subroutine blacs_gridinfo(ICONTXT, NPROW, NPCOL, MYPROW, MYPCOL) + use precision + implicit none + integer(kind=BLAS_KIND) :: ICONTXT + integer(kind=BLAS_KIND), intent(inout) :: NPROW, NPCOL, MYPROW, MYPCOL + end subroutine + end interface + + + interface + function numroc(N, NB, IPROC, ISRCPROC, NPROCS) result(numr) + use precision + implicit none + integer(kind=BLAS_KIND) :: N, NB, IPROC, ISRCPROC, NPROCS, numr + end function + end interface + + + interface + subroutine pdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, IA, JA, DESCA(*), IB, JB, DESCB(*), IC, JC, DESCC(*) + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: A(*), B(*), C(*) + end subroutine + end interface + + interface + subroutine pdnrm2(N, norm2, x, ix, jx, descx, incx) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, ix, jx, descx(*), incx + real(kind=rk8) :: norm2, x(*) + end subroutine + end interface + + interface + subroutine pdlaset(UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*) + 
real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: A(*) + end subroutine + end interface + + interface + subroutine pdtran(M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*), IC, JC, DESCC(*) + real(kind=rk8) :: ALPHA, BETA + real(kind=rk8) :: A(*), C(*) + end subroutine + end interface + + interface + function pdlange(NORM, M, N, A, IA, JA, DESCA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: norm + integer(kind=BLAS_KIND) :: m, n, ia, ja, desca(*) + real(kind=rk8) :: a(*), work(*) + real(kind=rk8) :: norm2 + end function + end interface + + + + + interface + subroutine psgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, IA, JA, DESCA(*), IB, JB, DESCB(*), IC, JC, DESCC(*) + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: A(*), B(*), C(*) + end subroutine + end interface + + interface + subroutine psnrm2(N, norm2, x, ix, jx, descx, incx) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, ix, jx, descx(*), incx + real(kind=rk4) :: norm2, x(*) + end subroutine + end interface + + interface + subroutine pslaset(UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*) + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: A(*) + end subroutine + end interface + + interface + subroutine pstran(M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*), IC, JC, DESCC(*) + real(kind=rk4) :: ALPHA, BETA + real(kind=rk4) :: A(*), C(*) + end subroutine + end interface + + interface + function pslange(NORM, M, N, A, IA, JA, DESCA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: 
norm + integer(kind=BLAS_KIND) :: m, n, ia, ja, desca(*) + real(kind=rk4) :: a(*), work(*) + real(kind=rk4) :: norm2 + end function + end interface + + interface + subroutine pzgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, IA, JA, DESCA(*), IB, JB, DESCB(*), IC, JC, DESCC(*) + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: A(*), B(*), C(*) + end subroutine + end interface + + interface + subroutine pzdotc(N, DOTC, X, ix, jx, descx, incx, Y, iy, jy, descy, incy) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, ix, jx, descx(*), incx, iy, jy, descy(*), incy + complex(kind=ck8) :: DOTC + complex(kind=ck8) :: X(*), Y(*) + end subroutine + end interface + + interface + subroutine pzlaset(UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*) + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: A(*) + end subroutine + end interface + + interface + subroutine pztranc(M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*), IC, JC, DESCC(*) + complex(kind=ck8) :: ALPHA, BETA + complex(kind=ck8) :: A(*), C(*) + end subroutine + end interface + + interface + function pzlange(NORM, M, N, A, IA, JA, DESCA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: norm + integer(kind=BLAS_KIND) :: m, n, ia, ja, desca(*) + complex(kind=ck8) :: a(*) + real(kind=rk8) ::work(*) + real(kind=rk8) :: norm2 + end function + end interface + + interface + subroutine pcgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + character :: TRANSA, TRANSB + integer(kind=BLAS_KIND) :: M, N, K, IA, JA, DESCA(*), IB, JB, 
DESCB(*), IC, JC, DESCC(*) + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: A(*), B(*), C(*) + end subroutine + end interface + + interface + subroutine pcdotc(N, DOTC, X, ix, jx, descx, incx, Y, iy, jy, descy, incy) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: N, ix, jx, descx(*), incx, iy, jy, descy(*), incy + complex(kind=ck4) :: DOTC + complex(kind=ck4) :: X(*), Y(*) + end subroutine + end interface + + interface + subroutine pclaset(UPLO, M, N, ALPHA, BETA, A, IA, JA, DESCA) + use PRECISION_MODULE + implicit none + character :: UPLO + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*) + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: A(*) + end subroutine + end interface + + interface + subroutine pctranc(M, N, ALPHA, A, IA, JA, DESCA, BETA, C, IC, JC, DESCC) + use PRECISION_MODULE + implicit none + integer(kind=BLAS_KIND) :: M, N, IA, JA, DESCA(*), IC, JC, DESCC(*) + complex(kind=ck4) :: ALPHA, BETA + complex(kind=ck4) :: A(*), C(*) + end subroutine + end interface + + interface + function pclange(NORM, M, N, A, IA, JA, DESCA, WORK) result(norm2) + use PRECISION_MODULE + implicit none + character :: norm + integer(kind=BLAS_KIND) :: m, n, ia, ja, desca(*) + complex(kind=ck4) :: a(*) + real(kind=rk4) ::work(*) + real(kind=rk4) :: norm2 + end function + end interface + diff -Nru elpa-2016.05.001/src/helpers/get_cpuid_set.c elpa-2019.11.001/src/helpers/get_cpuid_set.c --- elpa-2016.05.001/src/helpers/get_cpuid_set.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/get_cpuid_set.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,221 @@ +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> + +#include "elpa/elpa_simd_constants.h" + +static inline void get_cpu_manufacturer(int *set) +{ + u_int32_t registers[4]; + registers[0] = 0; + asm volatile("cpuid": "=a" (registers[0]),"=b" (registers[1]),"=c" (registers[3]),"=d" (registers[2]): "0" (registers[0]), "2" (registers[2]): "memory"); + + char str[13]="GenuineIntel\0"; + char manufacturer[13]; + + memcpy(manufacturer, &registers[1], 12); + manufacturer[12] = '\0'; + + if (strcmp(manufacturer, str) == 0) { + set[CPU_MANUFACTURER - 1] = 1; + } else { + set[CPU_MANUFACTURER - 1] = 0; + } +} + +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT +#include <cpuid.h> +void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} +#endif + +/* +!f>#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT +!f> interface +!f> subroutine get_cpuid_set(simdSet, n) & +!f> bind(C, name="get_cpuid_set") +!f> use, intrinsic :: iso_c_binding +!f> integer(kind=c_int), value :: n +!f> integer(kind=c_int) :: simdSet(n) +!f> end subroutine +!f> end interface +!f>#endif +*/ +void get_cpuid_set(int *set, int nlength){ + + get_cpu_manufacturer(set); + + + // Code below taken from http://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set/7495023#7495023 + + // Misc. 
+ bool HW_MMX; + bool HW_x64; + bool HW_ABM; // Advanced Bit Manipulation + bool HW_RDRAND; + bool HW_BMI1; + bool HW_BMI2; + bool HW_ADX; + bool HW_PREFETCHWT1; + + // SIMD: 128-bit + bool HW_SSE; + bool HW_SSE2; + bool HW_SSE3; + bool HW_SSSE3; + bool HW_SSE41; + bool HW_SSE42; + bool HW_SSE4a; + bool HW_AES; + bool HW_SHA; + + // SIMD: 256-bit + bool HW_AVX; + bool HW_XOP; + bool HW_FMA3; + bool HW_FMA4; + bool HW_AVX2; + // SIMD: 512-bit + bool HW_AVX512F; // AVX512 Foundation + bool HW_AVX512CD; // AVX512 Conflict Detection + bool HW_AVX512PF; // AVX512 Prefetch + bool HW_AVX512ER; // AVX512 Exponential + Reciprocal + bool HW_AVX512VL; // AVX512 Vector Length Extensions + bool HW_AVX512BW; // AVX512 Byte + Word + bool HW_AVX512DQ; // AVX512 Doubleword + Quadword + bool HW_AVX512IFMA; // AVX512 Integer 52-bit Fused Multiply-Add + bool HW_AVX512VBMI; // AVX512 Vector Byte Manipulation Instructions + + int info[4]; +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + + cpuid(info, 0); + int nIds = info[0]; + + cpuid(info, 0x80000000); + unsigned nExIds = info[0]; +#endif + // Detect Features + if (nIds >= 0x00000001){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x00000001); +#endif + HW_MMX = (info[3] & ((int)1 << 23)) != 0; + HW_SSE = (info[3] & ((int)1 << 25)) != 0; + HW_SSE2 = (info[3] & ((int)1 << 26)) != 0; + HW_SSE3 = (info[2] & ((int)1 << 0)) != 0; + + HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0; + HW_SSE41 = (info[2] & ((int)1 << 19)) != 0; + HW_SSE42 = (info[2] & ((int)1 << 20)) != 0; + HW_AES = (info[2] & ((int)1 << 25)) != 0; + + HW_AVX = (info[2] & ((int)1 << 28)) != 0; + HW_FMA3 = (info[2] & ((int)1 << 12)) != 0; + HW_RDRAND = (info[2] & ((int)1 << 30)) != 0; + } + if (nIds >= 0x00000007){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x00000007); +#endif + HW_AVX2 = (info[1] & ((int)1 << 5)) != 0; + + HW_BMI1 = (info[1] & ((int)1 << 3)) != 0; + HW_BMI2 = (info[1] & ((int)1 << 8)) != 0; + HW_ADX = (info[1] & ((int)1 << 19)) != 0; + HW_SHA = 
(info[1] & ((int)1 << 29)) != 0; + HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0; + + HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; + HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0; + HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0; + HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0; + HW_AVX512VL = (info[1] & ((int)1 << 31)) != 0; + HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0; + HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0; + HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0; + HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0; + } + + if (nExIds >= 0x80000001){ +#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT + cpuid(info,0x80000001); +#endif + HW_x64 = (info[3] & ((int)1 << 29)) != 0; + HW_ABM = (info[2] & ((int)1 << 5)) != 0; + HW_SSE4a = (info[2] & ((int)1 << 6)) != 0; + HW_FMA4 = (info[2] & ((int)1 << 16)) != 0; + HW_XOP = (info[2] & ((int)1 << 11)) != 0; + } + + //allways allow GENERIC + set[GENERIC_INSTR -1] =1; + + // the rest depends on the CPU + if (HW_SSE42) { + set[SSE_INSTR - 1] = 1; + } + if (HW_AVX) { + set[AVX_INSTR - 1] = 1; + } + if (HW_AVX2) { + set[AVX2_INSTR - 1] = 1; + } + if (HW_AVX512F) { + set[AVX512_INSTR -1] = 1; + } + +} + + diff -Nru elpa-2016.05.001/src/helpers/lapack_interfaces.h elpa-2019.11.001/src/helpers/lapack_interfaces.h --- elpa-2016.05.001/src/helpers/lapack_interfaces.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/lapack_interfaces.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,26 @@ +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define C_INT_TYPE_PTR long int* +#define C_INT_TYPE long int +#else +#define C_INT_TYPE_PTR int* +#define C_INT_TYPE int +#endif + +void dlacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, C_INT_TYPE_PTR, double*, C_INT_TYPE_PTR); +void dgemm_(char*, char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, double*, C_INT_TYPE_PTR, double*, C_INT_TYPE_PTR, double*, double*, C_INT_TYPE_PTR); + + +void slacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float*, C_INT_TYPE_PTR, float*, 
C_INT_TYPE_PTR); +void sgemm_(char*, char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float*, float*, C_INT_TYPE_PTR, float*, C_INT_TYPE_PTR, float*, float*, C_INT_TYPE_PTR); + + + + +void zlacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, C_INT_TYPE_PTR, double complex*, C_INT_TYPE_PTR); +void zgemm_(char*, char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, double complex*, C_INT_TYPE_PTR, double complex*, C_INT_TYPE_PTR, double complex*, double complex*, C_INT_TYPE_PTR); + + +void clacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, C_INT_TYPE_PTR, float complex*, C_INT_TYPE_PTR); +void cgemm_(char*, char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, float complex*, C_INT_TYPE_PTR, float complex*, C_INT_TYPE_PTR, float complex*, float complex*, C_INT_TYPE_PTR); + + diff -Nru elpa-2016.05.001/src/helpers/matrix_plot.F90 elpa-2019.11.001/src/helpers/matrix_plot.F90 --- elpa-2016.05.001/src/helpers/matrix_plot.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/matrix_plot.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,82 @@ +#define REAL_DATATYPE rk8 + +! module for producing matrix traces, to be plotted by provided python plotter +! currently the module is very simple and non-flexible +! it is only usable for printing the matrix A and possibly its counterpart A_DEV +! both are assumed to be in block-cyclic distribution +! At the moment, the output works for double real only +! To simplify things, a convenience macro (as follows) can be placed in a template file: + +! #undef SAVE_MATR +! #ifdef DOUBLE_PRECISION_REAL +! #define SAVE_MATR(name, iteration) \ +! call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration) +! #else +! #define SAVE_MATR(name, iteration) +! #endif + +! 
traces are stored into directory "matrices", that has to be created + +module matrix_plot + + contains + + subroutine prmat(na, useGpu, a_mat, a_dev, lda, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration) + use cuda_functions + use iso_c_binding + use precision + implicit none + integer, parameter :: out_unit=20 + character(len = 1025) :: directory = "matrices" + character(len = 1024) :: filename + + character(len = *), intent(in) :: name + integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, my_prow, my_pcol, np_rows, np_cols, iteration + real(kind=REAL_DATATYPE), intent(in) :: a_mat(lda,matrixCols) + integer(kind=C_intptr_T), intent(in) :: a_dev + logical, intent(in) :: useGPU + + integer(kind=ik) :: row, col, mpi_rank + integer(kind=ik), save :: counter = 0 + real(kind=REAL_DATATYPE) :: a_dev_helper(lda,matrixCols) + logical :: successCUDA + integer(kind=c_size_t), parameter :: size_of_datatype = size_of_double_real + + mpi_rank = np_rows * my_pcol + my_prow + + ! print a_mat + write(filename, "(A,A,I0.4,A,I0.2,A)") trim(directory), "/a_mat-", counter, "-", mpi_rank, ".txt" + write(*,*) trim(filename) + open(unit=out_unit, file=trim(filename), action="write",status="replace") + + write(out_unit, "(9I5)") na, nblk, lda, matrixCols, my_prow, my_pcol, np_rows, np_cols, iteration + write(out_unit, "(A)") name + do row = 1, lda + write(out_unit, *) a_mat(row, :) + end do + close(out_unit) + + ! 
print a_dev + + if(useGpu) then +#ifdef HAVE_GPU_VERSION + successCUDA = cuda_memcpy(int(loc(a_dev_helper(1,1)),kind=c_intptr_t), & + a_dev, lda * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost) +#endif + write(filename, "(A,A,I0.4,A,I0.2,A)") trim(directory), "/a_dev-", counter, "-", mpi_rank, ".txt" + write(*,*) trim(filename) + open(unit=out_unit, file=trim(filename), action="write",status="replace") + + write(out_unit, "(9I5)") na, nblk, lda, matrixCols, my_prow, my_pcol, np_rows, np_cols, iteration + write(out_unit, "(A)") name + do row = 1, lda + write(out_unit, *) a_dev_helper(row, :) + end do + close(out_unit) + end if + + counter = counter + 1 + + end subroutine + +end module diff -Nru elpa-2016.05.001/src/helpers/mod_blas_interfaces.F90 elpa-2019.11.001/src/helpers/mod_blas_interfaces.F90 --- elpa-2016.05.001/src/helpers/mod_blas_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_blas_interfaces.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,58 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! 
but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + + +#include "config-f90.h" + +#define PRECISION_MODULE precision + +module elpa_blas_interfaces + use iso_c_binding + use precision + + implicit none + + public +#include "./fortran_blas_interfaces.F90" + +end module + diff -Nru elpa-2016.05.001/src/helpers/mod_mpi.F90 elpa-2019.11.001/src/helpers/mod_mpi.F90 --- elpa-2016.05.001/src/helpers/mod_mpi.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_mpi.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,60 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +!> \brief Fortran module which exports the MPI functions to ELPA +module elpa_mpi +#ifndef WITH_MPI + use elpa_mpi_stubs +#else +#ifdef HAVE_MPI_MODULE + use mpi + implicit none +#else + implicit none + include 'mpif.h' +#endif +#endif + public +end module diff -Nru elpa-2016.05.001/src/helpers/mod_mpi_stubs.F90 elpa-2019.11.001/src/helpers/mod_mpi_stubs.F90 --- elpa-2016.05.001/src/helpers/mod_mpi_stubs.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_mpi_stubs.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,103 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! 
- Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
Author Andreas Marek, MPCDF + +#include "config-f90.h" +!> \brief Fortran module which exports the MPI stubs function, if ELPA as been built without MPI support +module elpa_mpi_stubs + use precision + implicit none + + public + + integer(kind=MPI_KIND), parameter :: MPI_COMM_SELF=1, MPI_COMM_WORLD=1, MPI_SUCCESS=0 + + contains + function MPI_WTIME() result(time) +#ifndef WITH_MPI + use time_c +#endif + implicit none + + real(kind=c_double) :: time +#ifndef WITH_MPI + time = seconds() +#endif + end function + + subroutine mpi_comm_size(mpi_comm_world, ntasks, mpierr) + implicit none + + integer(kind=MPI_KIND), intent(in) :: mpi_comm_world + integer(kind=MPI_KIND), intent(inout) :: ntasks + integer(kind=MPI_KIND), intent(inout) :: mpierr + + ntasks = 1 + mpierr = 0 + + return + + end subroutine mpi_comm_size + + subroutine mpi_comm_rank(mpi_comm_world, myid, mpierr) + implicit none + integer(kind=MPI_KIND), intent(in) :: mpi_comm_world + integer(kind=MPI_KIND), intent(inout) :: mpierr + integer(kind=MPI_KIND), intent(inout) :: myid + + myid = 0 + mpierr = 0 + + return + end subroutine mpi_comm_rank + + subroutine mpi_comm_split(mpi_communicator, color, key, new_comm, mpierr) + implicit none + integer(kind=MPI_KIND), intent(in) :: mpi_communicator, color, key + integer(kind=MPI_KIND), intent(inout) :: new_comm, mpierr + + new_comm = mpi_communicator + mpierr = 0 + return + end subroutine mpi_comm_split + +end module diff -Nru elpa-2016.05.001/src/helpers/mod_omp.F90 elpa-2019.11.001/src/helpers/mod_omp.F90 --- elpa-2016.05.001/src/helpers/mod_omp.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_omp.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,60 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! 
- Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
Author: Andreas Marek, MPCDF + +#include "config-f90.h" + +!> \brief Fortran module which exports the MPI functions to ELPA +module elpa_omp +#ifdef WITH_OPENMP + use omp_lib +#endif + use iso_c_binding + use precision + implicit none + public + + integer(kind=ik) :: omp_threads_caller + + +end module + diff -Nru elpa-2016.05.001/src/helpers/mod_precision.F90 elpa-2019.11.001/src/helpers/mod_precision.F90 --- elpa-2016.05.001/src/helpers/mod_precision.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_precision.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,67 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! 
license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + +#include "config-f90.h" +!> \brief Fortran module which defines the datatypes used in ELPA +module precision + use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_FLOAT_COMPLEX, C_DOUBLE_COMPLEX, C_INT32_T, C_INT64_T, C_INT + + implicit none + integer, parameter :: rk8 = C_DOUBLE + integer, parameter :: rk4 = C_FLOAT + integer, parameter :: ck8 = C_DOUBLE_COMPLEX + integer, parameter :: ck4 = C_FLOAT_COMPLEX + integer, parameter :: ik = C_INT32_T + integer, parameter :: lik = C_INT64_T + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT + integer, parameter :: BLAS_KIND = C_INT64_T +#else + integer, parameter :: BLAS_KIND = C_INT32_T +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT + integer, parameter :: MPI_KIND = C_INT64_T +#else + integer, parameter :: MPI_KIND = C_INT32_T +#endif + +end module precision diff -Nru elpa-2016.05.001/src/helpers/mod_scalapack_interfaces.F90 elpa-2019.11.001/src/helpers/mod_scalapack_interfaces.F90 --- elpa-2016.05.001/src/helpers/mod_scalapack_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_scalapack_interfaces.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,58 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + + +#include "config-f90.h" +#define PRECISION_MODULE precision +module elpa_scalapack_interfaces + use iso_c_binding + !use precision + + implicit none + + public + +#include "./fortran_scalapack_interfaces.F90" + +end module + + diff -Nru elpa-2016.05.001/src/helpers/mod_simd_kernel.F90 elpa-2019.11.001/src/helpers/mod_simd_kernel.F90 --- elpa-2016.05.001/src/helpers/mod_simd_kernel.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_simd_kernel.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,171 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! 
Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. 
Marek, MPCDF + +#include "config-f90.h" +#include "elpa/elpa_simd_constants.h" + +module simd_kernel + use elpa_constants + use iso_c_binding + + integer(kind=c_int) :: realKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_REAL_KERNELS) + integer(kind=c_int) :: simdTable_to_realKernels(NUMBER_OF_INSTR) + integer(kind=c_int) :: complexKernels_to_simdTable(ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS) + integer(kind=c_int) :: simdTable_to_complexKernels(NUMBER_OF_INSTR) + + contains + + function map_real_kernel_to_simd_instruction(kernel) result(simd_set_index) + + use iso_c_binding + implicit none + + integer(kind=c_int), intent(in) :: kernel + integer(kind=c_int) :: simd_set_index + + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_BGP) = BLUEGENE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_BGQ) = BLUEGENE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_ASSEMBLY) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK2) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK4) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SSE_BLOCK6) = SSE_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK2) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK4) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX_BLOCK6) = AVX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK2) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK4) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX2_BLOCK6) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK2) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK4) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_AVX512_BLOCK6) = AVX2_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU) = NVIDIA_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR + 
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6) = ARCH64_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK2) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK4) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_VSX_BLOCK6) = VSX_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4) = GENERIC_INSTR + realKernels_to_simdTable(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) = GENERIC_INSTR + + simd_set_index = realKernels_to_simdTable(kernel) + + + end + + function map_simd_instruction_to_real_kernel(simd_set_index) result(kernel) + + use iso_c_binding + implicit none + + + integer(kind=c_int) :: kernel + integer(kind=c_int), intent(in) :: simd_set_index + + simdTable_to_realKernels(GENERIC_INSTR) = ELPA_2STAGE_REAL_GENERIC + simdTable_to_realKernels(BLUEGENE_INSTR) = ELPA_2STAGE_REAL_BGP + simdTable_to_realKernels(SSE_INSTR) = ELPA_2STAGE_REAL_SSE_BLOCK2 + simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2 + simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2 + simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2 + simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_GPU + simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2 + simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 + simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2 + + kernel = simdTable_to_realKernels(simd_set_index) + + end + + function map_complex_kernel_to_simd_instruction(kernel) result(simd_set_index) + + use iso_c_binding + implicit none + integer(kind=c_int), intent(in) :: kernel + integer(kind=c_int) :: simd_set_index + + 
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC) = GENERIC_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) = GENERIC_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGP) = BLUEGENE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_BGQ) = BLUEGENE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK1) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SSE_BLOCK2) = SSE_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK1) = AVX_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX_BLOCK2) = AVX_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK1) = AVX2_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) = AVX2_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK1) = AVX512_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) = AVX512_INSTR + complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU) = NVIDIA_INSTR + + + simd_set_index = complexKernels_to_simdTable(kernel) + + end + + function map_simd_instruction_to_complex_kernel(simd_set_index) result(kernel) + + use iso_c_binding + implicit none + integer(kind=c_int) :: kernel + integer(kind=c_int), intent(in) :: simd_set_index + + simdTable_to_complexKernels(GENERIC_INSTR) = ELPA_2STAGE_COMPLEX_GENERIC + simdTable_to_complexKernels(BLUEGENE_INSTR) = ELPA_2STAGE_COMPLEX_BGP + simdTable_to_complexKernels(SSE_INSTR) = ELPA_2STAGE_COMPLEX_SSE_BLOCK1 + simdTable_to_complexKernels(AVX_INSTR) = ELPA_2STAGE_COMPLEX_AVX_BLOCK1 + simdTable_to_complexKernels(AVX2_INSTR) = ELPA_2STAGE_COMPLEX_AVX2_BLOCK1 + simdTable_to_complexKernels(AVX512_INSTR) = ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 + simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_GPU + + kernel = simdTable_to_complexKernels(simd_set_index) + + end + +end module + diff -Nru elpa-2016.05.001/src/helpers/mod_time_c.F90 
elpa-2019.11.001/src/helpers/mod_time_c.F90 --- elpa-2016.05.001/src/helpers/mod_time_c.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/mod_time_c.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,67 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
Author: Lorenz Huedepohl, MPCDF + +#include "config-f90.h" + +module time_c + + use precision + use, intrinsic :: iso_c_binding + + interface + function microseconds_since_epoch() result(ms) bind(C, name="ftimings_microseconds_since_epoch") + use, intrinsic :: iso_c_binding + implicit none + integer(kind=C_INT64_T) :: ms + end function + end interface + + interface + function seconds() result(s) bind(C, name="seconds") + use, intrinsic :: iso_c_binding + implicit none + real(kind=C_DOUBLE) :: s + end function + end interface + +end module time_c diff -Nru elpa-2016.05.001/src/helpers/print_build_config.c elpa-2019.11.001/src/helpers/print_build_config.c --- elpa-2016.05.001/src/helpers/print_build_config.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/print_build_config.c 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,73 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
+// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// Author: Andreas Marek, MPCDF + + +#include "config.h" +#include "elpa/elpa_build_config.h" +#include + + +/* +!f>#ifdef STORE_BUILD_CONFIG +!f> interface +!f> subroutine print_build_config() & +!f> bind(C, name="print_build_config") +!f> use, intrinsic :: iso_c_binding +!f> end subroutine +!f> end interface +!f>#endif +*/ + +void print_build_config(){ +#ifdef STORE_BUILD_CONFIG + printf("===============================================================\n"); + printf(" Output of the autoconf config.log created at build time \n\n"); + printf(" In case of troubles with the ELPA library, please send the \n follwing output together with a problem description \n at elpa-library@mpcdf.mpg.de \n\n"); + printf("%s \n",elpa_build_object); + printf("===============================================================\n"); +#endif +} diff -Nru elpa-2016.05.001/src/helpers/scalapack_interfaces.h elpa-2019.11.001/src/helpers/scalapack_interfaces.h --- elpa-2016.05.001/src/helpers/scalapack_interfaces.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/scalapack_interfaces.h 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,28 @@ +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define C_INT_TYPE_PTR long int* +#define C_INT_TYPE long int +#else +#define C_INT_TYPE_PTR int* +#define C_INT_TYPE int +#endif + +C_INT_TYPE numroc_(C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); + + +void pdlacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); +void pdtran_(C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, double*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double*, double*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); + + +void pslacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); +void pstran_(C_INT_TYPE_PTR, C_INT_TYPE_PTR, 
float*, float*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float*, float*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); + + + +void pzlacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); +void pztranc_(C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, double complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, double complex*, double complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); + + +void pclacpy_(char*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); +void pctranc_(C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, float complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR, float complex*, float complex*, C_INT_TYPE_PTR, C_INT_TYPE_PTR, C_INT_TYPE_PTR); + + diff -Nru elpa-2016.05.001/src/helpers/timer_dummy.F90 elpa-2019.11.001/src/helpers/timer_dummy.F90 --- elpa-2016.05.001/src/helpers/timer_dummy.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/src/helpers/timer_dummy.F90 2019-12-19 09:47:42.000000000 +0000 @@ -0,0 +1,132 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! 
enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +! Author: P. 
Kus, MPCDF + +#include "config-f90.h" + +module timings_dummy + implicit none + + type, public :: timer_dummy_t + contains + procedure, pass :: start => timer_start + procedure, pass :: stop => timer_stop + procedure, pass :: enable => timer_enable + procedure, pass :: free => timer_free + procedure, pass :: print => timer_print + procedure, pass :: measure_flops => timer_measure_flops + procedure, pass :: set_print_options => timer_set_print_options + end type + + type(timer_dummy_t) :: timer + type(timer_dummy_t) :: autotune_timer + + contains + + subroutine timer_print(self, name) + class(timer_dummy_t), intent(inout), target :: self + character(len=*), intent(in) :: name + + end subroutine + + subroutine timer_start(self, name, replace) + class(timer_dummy_t), intent(inout), target :: self + character(len=*), intent(in) :: name + logical, intent(in), optional :: replace + + end subroutine + + subroutine timer_stop(self, name) + class(timer_dummy_t), intent(inout), target :: self + character(len=*), intent(in), optional :: name + + end subroutine + + subroutine timer_enable(self) + class(timer_dummy_t), intent(inout), target :: self + + end subroutine + + subroutine timer_measure_flops(self, enable) + class(timer_dummy_t), intent(inout), target :: self + logical :: enable + end subroutine + + subroutine timer_set_print_options(self, print_allocated_memory, & + print_virtual_memory, & + print_max_allocated_memory, & + print_flop_count, & + print_flop_rate, & + print_ldst, & + print_memory_bandwidth, & + print_ai, & + bytes_per_ldst) + + class(timer_dummy_t), intent(inout), target :: self + logical, intent(in), optional :: & + print_allocated_memory, & + print_virtual_memory, & + print_max_allocated_memory, & + print_flop_count, & + print_flop_rate, & + print_ldst, & + print_memory_bandwidth, & + print_ai + integer, intent(in), optional :: bytes_per_ldst + end subroutine + + subroutine timer_free(self) + class(timer_dummy_t), intent(inout), target :: self + + end 
subroutine +end module timings_dummy diff -Nru elpa-2016.05.001/src/mod_compute_hh_trafo_complex.F90 elpa-2019.11.001/src/mod_compute_hh_trafo_complex.F90 --- elpa-2016.05.001/src/mod_compute_hh_trafo_complex.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/mod_compute_hh_trafo_complex.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,362 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! 
any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Andreas Marek, MPCDF - -module compute_hh_trafo_complex -#include "config-f90.h" - use elpa_mpi - implicit none - -#ifdef WITH_OPENMP - public compute_hh_trafo_complex_cpu_openmp -#else - public compute_hh_trafo_complex_cpu -#endif - - - contains - -#ifdef WITH_OPENMP - subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, & - my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) -#else - subroutine compute_hh_trafo_complex_cpu (a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) -#endif - use precision - use elpa2_utilities -#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) - use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple -#endif -#if defined(WITH_COMPLEX_GENERIC_KERNEL) - use complex_generic_kernel, only : single_hh_trafo_complex_generic -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - -#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) - use kernel_interfaces -#endif - implicit none - real(kind=rk), intent(inout) :: kernel_time - integer(kind=lik) :: kernel_flops - integer(kind=ik), intent(in) :: nbw, max_blk_size - complex(kind=ck) :: bcast_buffer(nbw,max_blk_size) - integer(kind=ik), intent(in) :: a_off - - integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count -#ifndef WITH_OPENMP - integer(kind=ik), intent(in) :: last_stripe_width - complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count) -#else - integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width - complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count,max_threads) -#endif - 
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL - - ! Private variables in OMP regions (my_thread) should better be in the argument list! - - integer(kind=ik) :: off, ncols, istripe, j, nl, jj -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, noff -#endif - real(kind=rk) :: ttt - - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - ! Currently (on Sandy Bridge), single is faster than double - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - complex(kind=ck) :: w(nbw,2) - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("compute_hh_trafo_complex_cpu_openmp") -#else - call timer%start("compute_hh_trafo_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - if (istripe -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
Author: Andreas Marek, MPCDF - -module compute_hh_trafo_real -#include "config-f90.h" - use elpa_mpi - implicit none - -#ifdef WITH_OPENMP - public compute_hh_trafo_real_cpu_openmp -#else - public compute_hh_trafo_real_cpu -#endif - - contains - -#ifdef WITH_OPENMP - subroutine compute_hh_trafo_real_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, & - my_thread, thread_width, THIS_REAL_ELPA_KERNEL) -#else - subroutine compute_hh_trafo_real_cpu (a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - off, ncols, istripe, last_stripe_width, & - THIS_REAL_ELPA_KERNEL) -#endif - - - use precision - use elpa2_utilities - use single_hh_trafo_real -#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) - use real_generic_simple_kernel, only : double_hh_trafo_generic_simple -#endif - -#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE)) - use real_generic_kernel, only : double_hh_trafo_generic -#endif - -#if defined(WITH_REAL_BGP_KERNEL) - use real_bgp_kernel, only : double_hh_trafo_bgp -#endif - -#if defined(WITH_REAL_BGQ_KERNEL) - use real_bgq_kernel, only : double_hh_trafo_bgq -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - -#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) - use kernel_interfaces -#endif - implicit none - real(kind=rk), intent(inout) :: kernel_time - integer(kind=lik) :: kernel_flops - integer(kind=ik), intent(in) :: nbw, max_blk_size - real(kind=rk) :: bcast_buffer(nbw,max_blk_size) - integer(kind=ik), intent(in) :: a_off - - integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count - -#ifndef WITH_OPENMP - integer(kind=ik), intent(in) :: last_stripe_width - real(kind=rk) :: a(stripe_width,a_dim2,stripe_count) -#else - integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width - real(kind=rk) :: 
a(stripe_width,a_dim2,stripe_count,max_threads) -#endif - integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL - - ! Private variables in OMP regions (my_thread) should better be in the argument list! - integer(kind=ik) :: off, ncols, istripe -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, noff -#endif - integer(kind=ik) :: j, nl, jj, jjj - real(kind=rk) :: w(nbw,6), ttt - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("compute_hh_trafo_real_cpu_openmp") -#else - call timer%start("compute_hh_trafo_real_cpu") -#endif -#endif - ttt = mpi_wtime() - -#ifndef WITH_OPENMP - nl = merge(stripe_width, last_stripe_width, istripe -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Andreas Marek, MPCDF - -#include "config-f90.h" - -module elpa_mpi -#ifndef WITH_MPI - use elpa_mpi_stubs -#else - implicit none - public - include "mpif.h" -#endif - -end module diff -Nru elpa-2016.05.001/src/mod_mpi_stubs.F90 elpa-2019.11.001/src/mod_mpi_stubs.F90 --- elpa-2016.05.001/src/mod_mpi_stubs.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/src/mod_mpi_stubs.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! 
- Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
Author Andreas Marek, MPCDF - -#include "config-f90.h" - -module elpa_mpi_stubs - use precision - implicit none - - public - - integer(kind=ik), parameter :: MPI_COMM_SELF=1, MPI_COMM_WORLD=1 - - contains - function MPI_WTIME() result(time) - use iso_c_binding -#ifndef WITH_MPI - use time_c -#endif - implicit none - - real(kind=c_double) :: time -#ifndef WITH_MPI - time = seconds() -#endif - end function - - subroutine mpi_comm_size(mpi_comm_world, ntasks, mpierr) - - use precision - - implicit none - - integer(kind=ik), intent(in) :: mpi_comm_world - integer(kind=ik), intent(inout) :: ntasks - integer(kind=ik), intent(inout) :: mpierr - - ntasks = 1 - mpierr = 0 - - return - - end subroutine mpi_comm_size - - subroutine mpi_comm_rank(mpi_comm_world, myid, mpierr) - use precision - implicit none - integer(kind=ik), intent(in) :: mpi_comm_world - integer(kind=ik), intent(inout) :: mpierr - integer(kind=ik), intent(inout) :: myid - - myid = 0 - mpierr = 0 - - return - end subroutine mpi_comm_rank - - subroutine mpi_comm_split(mpi_communicator, color, key, new_comm, mpierr) - use precision - implicit none - integer(kind=ik), intent(in) :: mpi_communicator, color, key - integer(kind=ik), intent(inout) :: new_comm, mpierr - - new_comm = mpi_communicator - mpierr = 0 - return - end subroutine mpi_comm_split - -end module diff -Nru elpa-2016.05.001/src/mod_pack_unpack_complex.F90 elpa-2019.11.001/src/mod_pack_unpack_complex.F90 --- elpa-2016.05.001/src/mod_pack_unpack_complex.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/mod_pack_unpack_complex.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! 
Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
Author: Andreas Marek, MPCDF - -module pack_unpack_complex -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public pack_row_complex_cpu_openmp -#else - public pack_row_complex_cpu -#endif - contains -#ifdef WITH_OPENMP - subroutine pack_row_complex_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - subroutine pack_row_complex_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none -#ifdef WITH_OPENMP - integer(kind=ik), intent(in) :: stripe_width, stripe_count, max_threads, thread_width, l_nev - complex(kind=ck), intent(in) :: a(:,:,:,:) -#else - integer(kind=ik), intent(in) :: stripe_width, last_stripe_width, stripe_count - complex(kind=ck), intent(in) :: a(:,:,:) -#endif - complex(kind=ck) :: row(:) - integer(kind=ik) :: n, i, noff, nl, nt - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("pack_row_complex_cpu_openmp") -#else - call timer%start("pack_row_complex_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - do nt = 1, max_threads - do i = 1, stripe_count - noff = (nt-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) - if (nl<=0) exit - row(noff+1:noff+nl) = a(1:nl,n,i,nt) - enddo - enddo -#else - do i=1,stripe_count - nl = merge(stripe_width, last_stripe_width, i -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
Author: Andreas Marek, MPCDF - -module pack_unpack_real -#include "config-f90.h" - implicit none - -#ifdef WITH_OPENMP - public pack_row_real_cpu_openmp, unpack_row_real_cpu_openmp -#else - public pack_row_real_cpu, unpack_row_real_cpu -#endif - contains - -#ifdef WITH_OPENMP - subroutine pack_row_real_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - subroutine pack_row_real_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n, stripe_count, stripe_width -#ifdef WITH_OPENMP - integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev - real(kind=rk), intent(in) :: a(:,:,:,:) -#else - integer(kind=ik), intent(in) :: last_stripe_width - real(kind=rk), intent(in) :: a(:,:,:) -#endif - real(kind=rk) :: row(:) - - integer(kind=ik) :: i, noff, nl -#ifdef WITH_OPENMP - integer(kind=ik) :: nt -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#ifdef WITH_OPENMP - call timer%start("pack_row_real_cpu_openmp") - -#else - call timer%start("pack_row_real_cpu") -#endif -#endif - -#ifdef WITH_OPENMP - do nt = 1, max_threads - do i = 1, stripe_count - noff = (nt-1)*thread_width + (i-1)*stripe_width - nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) - if (nl<=0) exit - row(noff+1:noff+nl) = a(1:nl,n,i,nt) - enddo - enddo -#else - do i=1,stripe_count - nl = merge(stripe_width, last_stripe_width, i -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
Author: Andreas Marek, MPCDF -module precision - use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_INT32_T, C_INT64_T - - implicit none - integer, parameter :: rk = C_DOUBLE - integer, parameter :: ck = C_DOUBLE - integer, parameter :: ik = C_INT32_T - integer, parameter :: lik = C_INT64_T -end module precision diff -Nru elpa-2016.05.001/src/mod_time_c.F90 elpa-2019.11.001/src/mod_time_c.F90 --- elpa-2016.05.001/src/mod_time_c.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/src/mod_time_c.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! 
ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Lorenz Huedepohl, MPCDF - -#include "config-f90.h" - -module time_c - - use precision - use, intrinsic :: iso_c_binding - - interface - function microseconds_since_epoch() result(ms) bind(C, name="ftimings_microseconds_since_epoch") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT64_T) :: ms - end function - end interface - - interface - function seconds() result(s) bind(C, name="seconds") - use, intrinsic :: iso_c_binding - implicit none - real(kind=C_DOUBLE) :: s - end function - end interface - -end module time_c diff -Nru elpa-2016.05.001/src/redist_band.X90 elpa-2019.11.001/src/redist_band.X90 --- elpa-2016.05.001/src/redist_band.X90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/redist_band.X90 1970-01-01 00:00:00.000000000 +0000 @@ -1,327 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! 
it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! Author: Andreas Marek, MPCDF -#endif -! -------------------------------------------------------------------------------------------------- -! 
redist_band: redistributes band from 2D block cyclic form to 1D band -#if REALCASE==1 -subroutine redist_band_real(r_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, r_ab) -#endif - -#if COMPLEXCASE==1 -subroutine redist_band_complex(c_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, c_ab) -#endif - - - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#if REALCASE==1 - real(kind=rk), intent(in) :: r_a(lda, matrixCols) -#endif -#if COMPLEXCASE==1 - complex(kind=ck), intent(in) :: c_a(lda, matrixCols) -#endif - - -#if REALCASE==1 - real(kind=rk), intent(out) :: r_ab(:,:) -#endif - -#if COMPLEXCASE==1 - complex(kind=ck), intent(out) :: c_ab(:,:) -#endif - - integer(kind=ik), allocatable :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), & - global_id(:,:), global_id_tmp(:,:), block_limits(:) -#if REALCASE==1 - real(kind=rk), allocatable :: r_sbuf(:,:,:), r_rbuf(:,:,:), r_buf(:,:) -#endif - -#if COMPLEXCASE==1 - complex(kind=ck), allocatable :: c_sbuf(:,:,:), c_rbuf(:,:,:), c_buf(:,:) -#endif - integer(kind=ik) :: i, j, my_pe, n_pes, my_prow, np_rows, my_pcol, np_cols, & - nfact, np, npr, npc, mpierr, is, js - integer(kind=ik) :: nblocks_total, il, jl, l_rows, l_cols, n_off - -#ifdef HAVE_DETAILED_TIMINGS -#if REALCASE==1 - call timer%start("redist_band_real") -#endif -#if COMPLEXCASE==1 - call timer%start("redist_band_complex") -#endif - -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! 
Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1)) -#ifdef WITH_OPENMP - allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) -#endif - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_MPI -#ifdef WITH_OPENMP - global_id_tmp(:,:) = global_id(:,:) - call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - deallocate(global_id_tmp) -#else - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#endif -#endif - ! Set work distribution - - nblocks_total = (na-1)/nbw + 1 - - allocate(block_limits(0:n_pes)) - call divide_band(nblocks_total, n_pes, block_limits) - - - allocate(ncnt_s(0:n_pes-1)) - allocate(nstart_s(0:n_pes-1)) - allocate(ncnt_r(0:n_pes-1)) - allocate(nstart_r(0:n_pes-1)) - - - nfact = nbw/nblk - - ! Count how many blocks go to which PE - - ncnt_s(:) = 0 - np = 0 ! receiver PE number - do j=0,(na-1)/nblk ! loop over rows of blocks - if (j/nfact==block_limits(np+1)) np = np+1 - if (mod(j,np_rows) == my_prow) then - do i=0,nfact - if (mod(i+j,np_cols) == my_pcol) then - ncnt_s(np) = ncnt_s(np) + 1 - endif - enddo - endif - enddo - - ! Allocate send buffer - -#if REALCASE==1 - allocate(r_sbuf(nblk,nblk,sum(ncnt_s))) - r_sbuf(:,:,:) = 0. -#endif -#if COMPLEXCASE==1 - allocate(c_sbuf(nblk,nblk,sum(ncnt_s))) - c_sbuf(:,:,:) = 0. -#endif - - ! Determine start offsets in send buffer - - nstart_s(0) = 0 - do i=1,n_pes-1 - nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) - enddo - - ! Fill send buffer - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a - - np = 0 - do j=0,(na-1)/nblk ! 
loop over rows of blocks - if (j/nfact==block_limits(np+1)) np = np+1 - if (mod(j,np_rows) == my_prow) then - do i=0,nfact - if (mod(i+j,np_cols) == my_pcol) then - nstart_s(np) = nstart_s(np) + 1 - js = (j/np_rows)*nblk - is = ((i+j)/np_cols)*nblk - jl = MIN(nblk,l_rows-js) - il = MIN(nblk,l_cols-is) - -#if REALCASE==1 - r_sbuf(1:jl,1:il,nstart_s(np)) = r_a(js+1:js+jl,is+1:is+il) -#endif -#if COMPLEXCASE==1 - c_sbuf(1:jl,1:il,nstart_s(np)) = c_a(js+1:js+jl,is+1:is+il) -#endif - endif - enddo - endif - enddo - - ! Count how many blocks we get from which PE - - ncnt_r(:) = 0 - do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) - npr = mod(j,np_rows) - do i=0,nfact - npc = mod(i+j,np_cols) - np = global_id(npr,npc) - ncnt_r(np) = ncnt_r(np) + 1 - enddo - enddo - - ! Allocate receive buffer - -#if REALCASE==1 - allocate(r_rbuf(nblk,nblk,sum(ncnt_r))) -#endif -#if COMPLEXCASE==1 - allocate(c_rbuf(nblk,nblk,sum(ncnt_r))) -#endif - - ! Set send counts/send offsets, receive counts/receive offsets - ! now actually in variables, not in blocks - - ncnt_s(:) = ncnt_s(:)*nblk*nblk - - nstart_s(0) = 0 - do i=1,n_pes-1 - nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1) - enddo - - ncnt_r(:) = ncnt_r(:)*nblk*nblk - - nstart_r(0) = 0 - do i=1,n_pes-1 - nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) - enddo - - ! Exchange all data with MPI_Alltoallv -#ifdef WITH_MPI - -#if REALCASE==1 - call MPI_Alltoallv(r_sbuf,ncnt_s,nstart_s,MPI_REAL8,r_rbuf,ncnt_r,nstart_r,MPI_REAL8,mpi_comm,mpierr) -#endif -#if COMPLEXCASE==1 - call MPI_Alltoallv(c_sbuf,ncnt_s,nstart_s,MPI_COMPLEX16,c_rbuf,ncnt_r,nstart_r,MPI_COMPLEX16,mpi_comm,mpierr) -#endif - -#else /* WITH_MPI */ - -#if REALCASE==1 - r_rbuf = r_sbuf -#endif - -#if COMPLEXCASE==1 - c_rbuf = c_sbuf -#endif -#endif /* WITH_MPI */ - ! 
set band from receive buffer - - ncnt_r(:) = ncnt_r(:)/(nblk*nblk) - - nstart_r(0) = 0 - do i=1,n_pes-1 - nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1) - enddo - -#if REALCASE==1 - allocate(r_buf((nfact+1)*nblk,nblk)) -#endif -#if COMPLEXCASE==1 - allocate(c_buf((nfact+1)*nblk,nblk)) -#endif - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nbw - - do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk) - npr = mod(j,np_rows) - do i=0,nfact - npc = mod(i+j,np_cols) - np = global_id(npr,npc) - nstart_r(np) = nstart_r(np) + 1 -#if REALCASE==1 - r_buf(i*nblk+1:i*nblk+nblk,:) = transpose(r_rbuf(:,:,nstart_r(np))) -#endif -#if COMPLEXCASE==1 - c_buf(i*nblk+1:i*nblk+nblk,:) = conjg(transpose(c_rbuf(:,:,nstart_r(np)))) -#endif - enddo - do i=1,MIN(nblk,na-j*nblk) -#if REALCASE==1 - r_ab(1:nbw+1,i+j*nblk-n_off) = r_buf(i:i+nbw,i) -#endif -#if COMPLEXCASE==1 - c_ab(1:nbw+1,i+j*nblk-n_off) = c_buf(i:i+nbw,i) -#endif - enddo - enddo - - deallocate(ncnt_s, nstart_s) - deallocate(ncnt_r, nstart_r) - deallocate(global_id) - deallocate(block_limits) - -#if REALCASE==1 - deallocate(r_sbuf, r_rbuf, r_buf) -#endif -#if COMPLEXCASE==1 - deallocate(c_sbuf, c_rbuf, c_buf) -#endif - -#ifdef HAVE_DETAILED_TIMINGS -#if REALCASE==1 - call timer%stop("redist_band_real") -#endif -#if COMPLEXCASE==1 - call timer%stop("redist_band_complex") -#endif - -#endif - - -end subroutine - diff -Nru elpa-2016.05.001/src/timer.F90 elpa-2019.11.001/src/timer.F90 --- elpa-2016.05.001/src/timer.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/src/timer.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -module timings - use ftimings - - implicit none - - type(timer_t) :: timer - - -end module timings diff -Nru elpa-2016.05.001/test/C/test_autotune.c elpa-2019.11.001/test/C/test_autotune.c --- elpa-2016.05.001/test/C/test_autotune.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/C/test_autotune.c 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 
+1,335 @@ +/* This file is part of ELPA. + + The ELPA library was originally created by the ELPA consortium, + consisting of the following organizations: + + - Max Planck Computing and Data Facility (MPCDF), formerly known as + Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), + - Bergische Universität Wuppertal, Lehrstuhl für angewandte + Informatik, + - Technische Universität München, Lehrstuhl für Informatik mit + Schwerpunkt Wissenschaftliches Rechnen , + - Fritz-Haber-Institut, Berlin, Abt. Theorie, + - Max-Plack-Institut für Mathematik in den Naturwissenschaften, + Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, + and + - IBM Deutschland GmbH + + + More information can be found here: + http://elpa.mpcdf.mpg.de/ + + ELPA is free software: you can redistribute it and/or modify + it under the terms of the version 3 of the license of the + GNU Lesser General Public License as published by the Free + Software Foundation. + + ELPA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with ELPA. If not, see + + ELPA reflects a substantial effort on the part of the original + ELPA consortium, and we ask you to respect the spirit of the + license that we chose: i.e., please contribute any changes you + may have back to the original ELPA library distribution, and keep + any derivatives of ELPA under the same license that we chose for + the original distribution, the GNU Lesser General Public License. 
+*/ + +#include "config.h" + +#include +#include +#include +#ifdef WITH_MPI +#include +#endif +#include + +#include +#include + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +//#error "define exactly one of TEST_REAL or TEST_COMPLEX" +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +//#error "define exactly one of TEST_SINGLE or TEST_DOUBLE" +#endif + +#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE)) +//#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE" +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE float +# ifdef TEST_REAL +# define MATRIX_TYPE float +# else +# define MATRIX_TYPE complex float +# endif +#else +# define EV_TYPE double +# ifdef TEST_REAL +# define MATRIX_TYPE double +# else +# define MATRIX_TYPE complex double +# endif +#endif + +#define assert_elpa_ok(x) assert(x == ELPA_OK) + +#ifdef HAVE_64BIT_INTEGER_SUPPORT +#define TEST_C_INT_TYPE_PTR long int* +#define C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#define C_INT_TYPE long int +#else +#define TEST_C_INT_TYPE_PTR int* +#define C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#define C_INT_TYPE int +#endif + +#include "test/shared/generated.h" + +int main(int argc, char** argv) { + /* matrix dimensions */ + C_INT_TYPE na, nev, nblk; + + /* mpi */ + C_INT_TYPE myid, nprocs; + C_INT_TYPE na_cols, na_rows; + C_INT_TYPE np_cols, np_rows; + C_INT_TYPE my_prow, my_pcol; + C_INT_TYPE mpi_comm; + + /* blacs */ + C_INT_TYPE my_blacs_ctxt, sc_desc[9], info; + + /* The Matrix */ + MATRIX_TYPE *a, *as, *z; + EV_TYPE *ev; + + C_INT_TYPE status; + int error_elpa; + elpa_t handle; + + elpa_autotune_t autotune_handle; + C_INT_TYPE i, unfinished; + + C_INT_TYPE value; +#ifdef WITH_MPI + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); +#else + nprocs = 1; + myid = 0; +#endif + + if (argc == 4) { + na = atoi(argv[1]); + nev = atoi(argv[2]); + nblk = atoi(argv[3]); + } else { + na = 500; + 
nev = 250; + nblk = 16; + } + + for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) { + if (nprocs % np_cols == 0) { + break; + } + } + + np_rows = nprocs/np_cols; + + /* set up blacs */ + /* convert communicators before */ +#ifdef WITH_MPI + mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD); +#else + mpi_comm = 0; +#endif + set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol); + set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); + + /* allocate the matrices needed for elpa */ + a = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + z = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + ev = calloc(na, sizeof(EV_TYPE)); + +#ifdef TEST_REAL +#ifdef TEST_DOUBLE + prepare_matrix_random_real_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#else + prepare_matrix_random_real_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#endif +#else +#ifdef TEST_DOUBLE + prepare_matrix_random_complex_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#else + prepare_matrix_random_complex_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#endif +#endif + + if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) { + fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + handle = elpa_allocate(); +#else + handle = elpa_allocate(&error_elpa); + assert_elpa_ok(error_elpa); +#endif + assert_elpa_ok(error_elpa); + + /* Set parameters */ + elpa_set(handle, "na", (int) na, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "nev", (int) nev, &error_elpa); + assert_elpa_ok(error_elpa); + + if (myid == 0) { + printf("Setting the matrix parameters na=%d, nev=%d \n",na,nev); + } + elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa); + 
assert_elpa_ok(error_elpa); + + elpa_set(handle, "nblk", (int) nblk, &error_elpa); + assert_elpa_ok(error_elpa); + +#ifdef WITH_MPI + elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "process_row", (int) my_prow, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "process_col", (int) my_pcol, &error_elpa); + assert_elpa_ok(error_elpa); +#endif + + /* Setup */ + assert_elpa_ok(elpa_setup(handle)); + + elpa_set(handle, "gpu", 0, &error_elpa); + assert_elpa_ok(error_elpa); + + autotune_handle = elpa_autotune_setup(handle, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, &error_elpa); + assert_elpa_ok(error_elpa); + /* mimic 20 scf steps */ + + for (i=0; i < 20; i++) { + + unfinished = elpa_autotune_step(handle, autotune_handle, &error_elpa); + + if (unfinished == 0) { + if (myid == 0) { + printf("ELPA autotuning finished in the %d th scf step \n",i); + } + break; + } + if (myid == 0) { + printf("The current setting of the ELPA object: \n"); + elpa_print_settings(handle, &error_elpa); + + printf("The state of the autotuning: \n"); + elpa_autotune_print_state(handle, autotune_handle, &error_elpa); + } + + + /* Solve EV problem */ + elpa_eigenvectors(handle, a, ev, z, &error_elpa); + assert_elpa_ok(error_elpa); + + /* check the results */ +#ifdef TEST_REAL +#ifdef TEST_DOUBLE + status = check_correctness_evp_numeric_residuals_real_double_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(double)); + +#else + status = check_correctness_evp_numeric_residuals_real_single_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(float)); +#endif +#else +#ifdef TEST_DOUBLE + status = check_correctness_evp_numeric_residuals_complex_double_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, 
np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(complex double)); +#else + status = check_correctness_evp_numeric_residuals_complex_single_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(complex float)); +#endif +#endif + + if (status !=0){ + printf("The computed EVs are not correct !\n"); + break; + } + printf("hier %d \n",myid); + } + + if (unfinished == 1) { + if (myid == 0) { + printf("ELPA autotuning did not finished during %d scf cycles\n",i); + + } + + } + elpa_autotune_set_best(handle, autotune_handle, &error_elpa); + + if (myid == 0) { + printf("The best combination found by the autotuning:\n"); + elpa_autotune_print_best(handle, autotune_handle, &error_elpa); + } + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + elpa_autotune_deallocate(autotune_handle); + elpa_deallocate(handle); +#else + elpa_autotune_deallocate(autotune_handle, &error_elpa); + elpa_deallocate(handle, &error_elpa); +#endif + elpa_uninit(&error_elpa); + + if (myid == 0) { + printf("\n"); + printf("2stage ELPA real solver complete\n"); + printf("\n"); + } + + if (status ==0){ + if (myid ==0) { + printf("All ok!\n"); + } + } + + free(a); + free(z); + free(as); + free(ev); + +#ifdef WITH_MPI + MPI_Finalize(); +#endif + + return !!status; +} diff -Nru elpa-2016.05.001/test/C/test.c elpa-2019.11.001/test/C/test.c --- elpa-2016.05.001/test/C/test.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/C/test.c 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,339 @@ +/* This file is part of ELPA. 
+ + The ELPA library was originally created by the ELPA consortium, + consisting of the following organizations: + + - Max Planck Computing and Data Facility (MPCDF), formerly known as + Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), + - Bergische Universität Wuppertal, Lehrstuhl für angewandte + Informatik, + - Technische Universität München, Lehrstuhl für Informatik mit + Schwerpunkt Wissenschaftliches Rechnen , + - Fritz-Haber-Institut, Berlin, Abt. Theorie, + - Max-Plack-Institut für Mathematik in den Naturwissenschaften, + Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, + and + - IBM Deutschland GmbH + + + More information can be found here: + http://elpa.mpcdf.mpg.de/ + + ELPA is free software: you can redistribute it and/or modify + it under the terms of the version 3 of the license of the + GNU Lesser General Public License as published by the Free + Software Foundation. + + ELPA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with ELPA. If not, see + + ELPA reflects a substantial effort on the part of the original + ELPA consortium, and we ask you to respect the spirit of the + license that we chose: i.e., please contribute any changes you + may have back to the original ELPA library distribution, and keep + any derivatives of ELPA under the same license that we chose for + the original distribution, the GNU Lesser General Public License. 
+*/ + +#include "config.h" + +#include +#include +#include +#ifdef WITH_MPI +#include +#endif +#include + +#include +#include + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +#error "define exactly one of TEST_REAL or TEST_COMPLEX" +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +#error "define exactly one of TEST_SINGLE or TEST_DOUBLE" +#endif + +#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE)) +#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE" +#endif + +#ifdef TEST_GENERALIZED_DECOMP_EIGENPROBLEM +#define TEST_GENERALIZED_EIGENPROBLEM +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE float +# ifdef TEST_REAL +# define MATRIX_TYPE float +# define PREPARE_MATRIX_RANDOM prepare_matrix_random_real_single_f +# define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_real_single_f +# define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_real_single_f +# define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_real_single_f +# else +# define MATRIX_TYPE complex float +# define PREPARE_MATRIX_RANDOM prepare_matrix_random_complex_single_f +# define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_complex_single_f +# define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_complex_single_f +# define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_complex_single_f +# endif +#else +# define EV_TYPE double +# ifdef TEST_REAL +# define MATRIX_TYPE double +# define PREPARE_MATRIX_RANDOM prepare_matrix_random_real_double_f +# define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_real_double_f +# define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_real_double_f +# define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_real_double_f +# else +# define MATRIX_TYPE complex double +# define 
PREPARE_MATRIX_RANDOM prepare_matrix_random_complex_double_f +# define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_complex_double_f +# define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_complex_double_f +# define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_complex_double_f +# endif +#endif + +#define assert_elpa_ok(x) assert(x == ELPA_OK) + + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_C_INT_TYPE_PTR long int* +#define C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#define C_INT_TYPE long int +#else +#define TEST_C_INT_TYPE_PTR int* +#define C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#define C_INT_TYPE int +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_C_INT_MPI_TYPE_PTR long int* +#define C_INT_MPI_TYPE_PTR long int* +#define TEST_C_INT_MPI_TYPE long int +#define C_INT_MPI_TYPE long int +#else +#define TEST_C_INT_MPI_TYPE_PTR int* +#define C_INT_MPI_TYPE_PTR int* +#define TEST_C_INT_MPI_TYPE int +#define C_INT_MPI_TYPE int +#endif +#include "test/shared/generated.h" + +int main(int argc, char** argv) { + /* matrix dimensions */ + C_INT_TYPE na, nev, nblk; + + /* mpi */ + C_INT_TYPE myid, nprocs; + C_INT_MPI_TYPE myidMPI, nprocsMPI; + C_INT_TYPE na_cols, na_rows; + C_INT_TYPE np_cols, np_rows; + C_INT_TYPE my_prow, my_pcol; + C_INT_TYPE mpi_comm; + C_INT_MPI_TYPE provided_mpi_thread_level; + + /* blacs */ + C_INT_TYPE my_blacs_ctxt, sc_desc[9], info; + + /* The Matrix */ + MATRIX_TYPE *a, *as, *z, *b, *bs; + EV_TYPE *ev; + + C_INT_TYPE error, status; + int error_elpa; + + elpa_t handle; + + int value; +#ifdef WITH_MPI +#ifndef WITH_OPENMP + MPI_Init(&argc, &argv); +#else + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided_mpi_thread_level); + + if (provided_mpi_thread_level != MPI_THREAD_MULTIPLE) { + fprintf(stderr, "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system\n"); + MPI_Finalize(); + exit(77); + } 
+#endif + + MPI_Comm_size(MPI_COMM_WORLD, &nprocsMPI); + nprocs = (C_INT_TYPE) nprocsMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &myidMPI); + myid = (C_INT_TYPE) myidMPI; + +#else + nprocs = 1; + myid = 0; +#endif + + if (argc == 4) { + na = atoi(argv[1]); + nev = atoi(argv[2]); + nblk = atoi(argv[3]); + } else { + na = 500; + nev = 250; + nblk = 16; + } + + for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) { + if (nprocs % np_cols == 0) { + break; + } + } + + np_rows = nprocs/np_cols; + + /* set up blacs */ + /* convert communicators before */ +#ifdef WITH_MPI + mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD); +#else + mpi_comm = 0; +#endif + set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol); + set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); + + /* allocate the matrices needed for elpa */ + a = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + z = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + ev = calloc(na, sizeof(EV_TYPE)); + + PREPARE_MATRIX_RANDOM(na, myid, na_rows, na_cols, sc_desc, a, z, as); + +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + b = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + bs = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + PREPARE_MATRIX_RANDOM_SPD(na, myid, na_rows, na_cols, sc_desc, b, z, bs, nblk, np_rows, np_cols, my_prow, my_pcol); +#endif + + if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) { + fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } + + handle = elpa_allocate(&error_elpa); + //assert_elpa_ok(error_elpa); + + /* Set parameters */ + elpa_set(handle, "na", (int) na, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "nev", (int) nev, &error_elpa); + assert_elpa_ok(error_elpa); + + if (myid == 0) { + printf("Setting the matrix parameters na=%d, nev=%d \n",na,nev); + } + elpa_set(handle, "local_nrows", (int) na_rows, 
&error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "nblk", (int) nblk, &error_elpa); + assert_elpa_ok(error_elpa); + +#ifdef WITH_MPI + elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "process_row", (int) my_prow, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(handle, "process_col", (int) my_pcol, &error_elpa); + assert_elpa_ok(error_elpa); +#endif +#ifdef TEST_GENERALIZED_EIGENPROBLEM + elpa_set(handle, "blacs_context", (int) my_blacs_ctxt, &error_elpa); + assert_elpa_ok(error_elpa); +#endif + + /* Setup */ + assert_elpa_ok(elpa_setup(handle)); + + /* Set tunables */ +#ifdef TEST_SOLVER_1STAGE + elpa_set(handle, "solver", ELPA_SOLVER_1STAGE, &error_elpa); +#else + elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa); +#endif + assert_elpa_ok(error_elpa); + + elpa_set(handle, "gpu", TEST_GPU, &error_elpa); + assert_elpa_ok(error_elpa); + +#if defined(TEST_SOLVE_2STAGE) && defined(TEST_KERNEL) +# ifdef TEST_COMPLEX + elpa_set(handle, "complex_kernel", TEST_KERNEL, &error_elpa); +# else + elpa_set(handle, "real_kernel", TEST_KERNEL, &error_elpa); +# endif + assert_elpa_ok(error_elpa); +#endif + + elpa_get(handle, "solver", &value, &error_elpa); + if (myid == 0) { + printf("Solver is set to %d \n", value); + } + +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + elpa_generalized_eigenvectors(handle, a, b, ev, z, 0, &error_elpa); +#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM) + //a = as, so that the problem can be solved again + memcpy(a, as, na_rows * na_cols * sizeof(MATRIX_TYPE)); + elpa_generalized_eigenvectors(handle, a, b, ev, z, 1, &error_elpa); +#endif +#else + /* Solve EV problem */ + elpa_eigenvectors(handle, a, ev, z, &error_elpa); +#endif + assert_elpa_ok(error_elpa); + + elpa_deallocate(handle, &error_elpa); + elpa_uninit(&error_elpa); + + /* 
check the results */ +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + status = CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs); +#else + status = CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); +#endif + + if (status !=0){ + printf("The computed EVs are not correct !\n"); + } + if (status ==0){ + printf("All ok!\n"); + } + + free(a); + free(z); + free(as); + free(ev); +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + free(b); + free(bs); +#endif + +#ifdef WITH_MPI + MPI_Finalize(); +#endif + + return !!status; +} diff -Nru elpa-2016.05.001/test/C/test_multiple_objs.c elpa-2019.11.001/test/C/test_multiple_objs.c --- elpa-2016.05.001/test/C/test_multiple_objs.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/C/test_multiple_objs.c 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,387 @@ +/* This file is part of ELPA. + + The ELPA library was originally created by the ELPA consortium, + consisting of the following organizations: + + - Max Planck Computing and Data Facility (MPCDF), formerly known as + Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), + - Bergische Universität Wuppertal, Lehrstuhl für angewandte + Informatik, + - Technische Universität München, Lehrstuhl für Informatik mit + Schwerpunkt Wissenschaftliches Rechnen , + - Fritz-Haber-Institut, Berlin, Abt. Theorie, + - Max-Plack-Institut für Mathematik in den Naturwissenschaften, + Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, + and + - IBM Deutschland GmbH + + + More information can be found here: + http://elpa.mpcdf.mpg.de/ + + ELPA is free software: you can redistribute it and/or modify + it under the terms of the version 3 of the license of the + GNU Lesser General Public License as published by the Free + Software Foundation. 
+ + ELPA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with ELPA. If not, see + + ELPA reflects a substantial effort on the part of the original + ELPA consortium, and we ask you to respect the spirit of the + license that we chose: i.e., please contribute any changes you + may have back to the original ELPA library distribution, and keep + any derivatives of ELPA under the same license that we chose for + the original distribution, the GNU Lesser General Public License. +*/ + +#include "config.h" + +#include +#include +#include +#ifdef WITH_MPI +#include +#endif +#include + +#include +#include + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +//#error "define exactly one of TEST_REAL or TEST_COMPLEX" +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +//#error "define exactly one of TEST_SINGLE or TEST_DOUBLE" +#endif + +#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE)) +//#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE" +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE float +# ifdef TEST_REAL +# define MATRIX_TYPE float +# else +# define MATRIX_TYPE complex float +# endif +#else +# define EV_TYPE double +# ifdef TEST_REAL +# define MATRIX_TYPE double +# else +# define MATRIX_TYPE complex double +# endif +#endif + +#define assert_elpa_ok(x) assert(x == ELPA_OK) +#ifdef HAVE_64BIT_INTEGER_SUPPORT +#define TEST_C_INT_TYPE_PTR long int* +#define C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#define C_INT_TYPE long int +#else +#define TEST_C_INT_TYPE_PTR int* +#define C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#define C_INT_TYPE int +#endif + +#include "test/shared/generated.h" +void set_basic_parameters(elpa_t *handle, 
C_INT_TYPE na, C_INT_TYPE nev, C_INT_TYPE na_rows, C_INT_TYPE na_cols, C_INT_TYPE nblk, C_INT_TYPE my_prow, C_INT_TYPE my_pcol){ + int error_elpa; + elpa_set(*handle, "na", (int) na, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "nev", (int) nev, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "local_nrows", (int) na_rows, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "local_ncols", (int) na_cols, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "nblk", (int) nblk, &error_elpa); + assert_elpa_ok(error_elpa); + +#ifdef WITH_MPI + elpa_set(*handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "process_row", (int) my_prow, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(*handle, "process_col", (int) my_pcol, &error_elpa); + assert_elpa_ok(error_elpa); +#endif +} + + +int main(int argc, char** argv) { + /* matrix dimensions */ + C_INT_TYPE na, nev, nblk; + + /* mpi */ + C_INT_TYPE myid, nprocs; + C_INT_TYPE na_cols, na_rows; + C_INT_TYPE np_cols, np_rows; + C_INT_TYPE my_prow, my_pcol; + C_INT_TYPE mpi_comm; + + /* blacs */ + C_INT_TYPE my_blacs_ctxt, sc_desc[9], info; + + /* The Matrix */ + MATRIX_TYPE *a, *as, *z; + EV_TYPE *ev; + + C_INT_TYPE status; + int error_elpa; + int gpu, timings, debug; + char str[400]; + + elpa_t elpa_handle_1, elpa_handle_2, *elpa_handle_ptr; + + elpa_autotune_t autotune_handle; + C_INT_TYPE i, unfinished; + + C_INT_TYPE value; +#ifdef WITH_MPI + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); +#else + nprocs = 1; + myid = 0; +#endif + + if (argc == 4) { + na = atoi(argv[1]); + nev = atoi(argv[2]); + nblk = atoi(argv[3]); + } else { + na = 500; + nev = 250; + nblk = 16; + } + + for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) { + if (nprocs % np_cols == 0) { + break; + } + } + + np_rows = 
nprocs/np_cols; + + /* set up blacs */ + /* convert communicators before */ +#ifdef WITH_MPI + mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD); +#else + mpi_comm = 0; +#endif + set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol); + set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); + + /* allocate the matrices needed for elpa */ + a = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + z = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE)); + ev = calloc(na, sizeof(EV_TYPE)); + +#ifdef TEST_REAL +#ifdef TEST_DOUBLE + prepare_matrix_random_real_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#else + prepare_matrix_random_real_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#endif +#else +#ifdef TEST_DOUBLE + prepare_matrix_random_complex_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#else + prepare_matrix_random_complex_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); +#endif +#endif + + if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) { + fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } + + elpa_handle_1 = elpa_allocate(&error_elpa); + assert_elpa_ok(error_elpa); + + set_basic_parameters(&elpa_handle_1, na, nev, na_rows, na_cols, nblk, my_prow, my_pcol); + /* Setup */ + assert_elpa_ok(elpa_setup(elpa_handle_1)); + + elpa_set(elpa_handle_1, "gpu", 0, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(elpa_handle_1, "timings", 1, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_set(elpa_handle_1, "debug", 1, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_store_settings(elpa_handle_1, "initial_parameters.txt", &error_elpa); + assert_elpa_ok(error_elpa); + +#ifdef WITH_MPI + // barrier after store settings, file created from one MPI rank only, but loaded everywhere + MPI_Barrier(MPI_COMM_WORLD); +#endif + +#ifdef OPTIONAL_C_ERROR_ARGUMENT + 
elpa_handle_2 = elpa_allocate(); +#else + elpa_handle_2 = elpa_allocate(&error_elpa); + assert_elpa_ok(error_elpa); +#endif + + set_basic_parameters(&elpa_handle_2, na, nev, na_rows, na_cols, nblk, my_prow, my_pcol); + /* Setup */ + assert_elpa_ok(elpa_setup(elpa_handle_2)); + + elpa_load_settings(elpa_handle_2, "initial_parameters.txt", &error_elpa); + + elpa_get(elpa_handle_2, "gpu", &gpu, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_get(elpa_handle_2, "timings", &timings, &error_elpa); + assert_elpa_ok(error_elpa); + + elpa_get(elpa_handle_2, "debug", &debug, &error_elpa); + assert_elpa_ok(error_elpa); + + if ((timings != 1) || (debug != 1) || (gpu != 0)){ + printf("Parameters not stored or loaded correctly. Aborting... %d, %d, %d\n", timings, debug, gpu); + exit(1); + } + + elpa_handle_ptr = &elpa_handle_2; + + autotune_handle = elpa_autotune_setup(*elpa_handle_ptr, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, &error_elpa); + assert_elpa_ok(error_elpa); + /* mimic 20 scf steps */ + + for (i=0; i < 20; i++) { + + unfinished = elpa_autotune_step(*elpa_handle_ptr, autotune_handle, &error_elpa); + + if (unfinished == 0) { + if (myid == 0) { + printf("ELPA autotuning finished in the %d th scf step \n",i); + } + break; + } + + elpa_print_settings(*elpa_handle_ptr, &error_elpa); + elpa_autotune_print_state(*elpa_handle_ptr, autotune_handle, &error_elpa); + + sprintf(str, "saved_parameters_%d.txt", i); + elpa_store_settings(*elpa_handle_ptr, str, &error_elpa); + assert_elpa_ok(error_elpa); + + /* Solve EV problem */ + elpa_eigenvectors(*elpa_handle_ptr, a, ev, z, &error_elpa); + assert_elpa_ok(error_elpa); + + /* check the results */ +#ifdef TEST_REAL +#ifdef TEST_DOUBLE + status = check_correctness_evp_numeric_residuals_real_double_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(double)); + +#else + status = check_correctness_evp_numeric_residuals_real_single_f(na, 
nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(float)); +#endif +#else +#ifdef TEST_DOUBLE + status = check_correctness_evp_numeric_residuals_complex_double_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(complex double)); +#else + status = check_correctness_evp_numeric_residuals_complex_single_f(na, nev, na_rows, na_cols, as, z, ev, + sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol); + memcpy(a, as, na_rows*na_cols*sizeof(complex float)); +#endif +#endif + + if (status !=0){ + printf("The computed EVs are not correct !\n"); + break; + } + + elpa_autotune_print_state(*elpa_handle_ptr, autotune_handle, &error_elpa); + assert_elpa_ok(error_elpa); + + sprintf(str, "saved_state_%d.txt", i); + elpa_autotune_save_state(*elpa_handle_ptr, autotune_handle, str, &error_elpa); + assert_elpa_ok(error_elpa); + +#ifdef WITH_MPI + //barrier after save state, file created from one MPI rank only, but loaded everywhere + MPI_Barrier(MPI_COMM_WORLD); +#endif + + elpa_autotune_load_state(*elpa_handle_ptr, autotune_handle, str, &error_elpa); + assert_elpa_ok(error_elpa); + + if (unfinished == 1) { + if (myid == 0) { + printf("ELPA autotuning did not finished during %d scf cycles\n",i); + } + } + + } + elpa_autotune_set_best(*elpa_handle_ptr, autotune_handle, &error_elpa); + + if (myid == 0) { + printf("The best combination found by the autotuning:\n"); + elpa_autotune_print_best(*elpa_handle_ptr, autotune_handle, &error_elpa); + } + + elpa_autotune_deallocate(autotune_handle, &error_elpa); + elpa_deallocate(elpa_handle_1, &error_elpa); +#ifdef OPTIONAL_C_ERROR_ARGUMENT + elpa_deallocate(elpa_handle_2); +#else + elpa_deallocate(elpa_handle_2, &error_elpa); +#endif + elpa_uninit(&error_elpa); + + if (myid == 0) { + printf("\n"); + printf("2stage ELPA real solver complete\n"); + printf("\n"); + } + + if 
(status ==0){ + if (myid ==0) { + printf("All ok!\n"); + } + } + + free(a); + free(z); + free(as); + free(ev); + +#ifdef WITH_MPI + MPI_Finalize(); +#endif + + return !!status; +} diff -Nru elpa-2016.05.001/test/c_test_programs/elpa1_test_complex_c_version.c elpa-2019.11.001/test/c_test_programs/elpa1_test_complex_c_version.c --- elpa-2016.05.001/test/c_test_programs/elpa1_test_complex_c_version.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/c_test_programs/elpa1_test_complex_c_version.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,241 +0,0 @@ -/* This file is part of ELPA. */ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. 
*/ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. */ -/* */ -/* */ - -#include "config-f90.h" - -#include -#include -#ifdef WITH_MPI -#include -#endif -#include - -#include -#include -#include - -int main(int argc, char** argv) { - int myid; - int nprocs; -#ifndef WITH_MPI - int MPI_COMM_WORLD; -#endif - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - complex double *a, *z, *as, *tmp1, *tmp2; - - double *ev, *xr; - - int *iseed; - - int success; -#ifdef WITH_MPI - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); -#else - nprocs=1; - myid=0; - MPI_COMM_WORLD=1; -#endif - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA complex solver for a matrix\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#else - my_mpi_comm_world = 1; -#endif - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#endif - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(complex double)); - z = malloc(na_rows*na_cols*sizeof(complex double)); - as = malloc(na_rows*na_cols*sizeof(complex double)); - - xr = malloc(na_rows*na_cols*sizeof(double)); - - - ev = 
malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(complex double)); - tmp2 = malloc(na_rows*na_cols*sizeof(complex double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_complex_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as); - - free(xr); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 1stage complex solver\n"); - printf("\n"); - } -#ifdef WITH_MPI - mpierr = MPI_Barrier(MPI_COMM_WORLD); -#endif - success = elpa_solve_evp_complex_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols); - - if (success != 1) { - printf("error in ELPA solve \n"); -#ifdef WITH_MPI - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); -#endif - } - - - if (myid == 0) { - printf("\n"); - printf("1stage ELPA complex solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_complex_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - printf("All ok!\n"); - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); -#ifdef WITH_MPI - MPI_Finalize(); -#endif - return 0; -} diff -Nru elpa-2016.05.001/test/c_test_programs/elpa1_test_real_c_version.c elpa-2019.11.001/test/c_test_programs/elpa1_test_real_c_version.c --- elpa-2016.05.001/test/c_test_programs/elpa1_test_real_c_version.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/c_test_programs/elpa1_test_real_c_version.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,234 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include -#include -#ifdef WITH_MPI -#include -#endif -#include - -#include - -#include "test/shared_sources/generated.h" - -int main(int argc, char** argv) { - int myid; - int nprocs; -#ifndef WITH_MPI - int MPI_COMM_WORLD; -#endif - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - double *a, *z, *as, *ev, *tmp1, *tmp2; - - int *iseed; - - int success; -#ifdef WITH_MPI - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); -#else - nprocs = 1; - myid = 0; - MPI_COMM_WORLD=1; -#endif - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA real solver for an\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#endif - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#else - my_mpi_comm_world =1 ; -#endif - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(double)); - z = malloc(na_rows*na_cols*sizeof(double)); - as = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(double)); - tmp2 = 
malloc(na_rows*na_cols*sizeof(double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_real_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 1stage real solver\n"); - printf("\n"); - } -#ifdef WITH_MPI - mpierr = MPI_Barrier(MPI_COMM_WORLD); -#endif - success = elpa_solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols); - - if (success != 1) { - printf("error in ELPA solve \n"); -#ifdef WITH_MPI - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); -#endif - } - - - if (myid == 0) { - printf("\n"); - printf("1stage ELPA real solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_real_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - printf("All ok!\n"); - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); -#ifdef WITH_MPI - MPI_Finalize(); -#endif - return 0; -} diff -Nru elpa-2016.05.001/test/c_test_programs/elpa2_test_complex_c_version.c elpa-2019.11.001/test/c_test_programs/elpa2_test_complex_c_version.c --- elpa-2016.05.001/test/c_test_programs/elpa2_test_complex_c_version.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/c_test_programs/elpa2_test_complex_c_version.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,246 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include -#include -#ifdef WITH_MPI -#include -#endif -#include - -#include -#include -#include - -int main(int argc, char** argv) { - int myid; - int nprocs; -#ifndef WITH_MPI - int MPI_COMM_WORLD; -#endif - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - complex double *a, *z, *as, *tmp1, *tmp2; - - double *ev, *xr; - - int *iseed; - - int success; - - int THIS_COMPLEX_ELPA_KERNEL_API; -#ifdef WITH_MPI - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); -#else - nprocs = 1; - myid =0; - MPI_COMM_WORLD=1; -#endif - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA complex solver for a matrix\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#else - my_mpi_comm_world = 1; -#endif - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#endif - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(complex double)); - z = malloc(na_rows*na_cols*sizeof(complex double)); - as = malloc(na_rows*na_cols*sizeof(complex double)); - - xr = malloc(na_rows*na_cols*sizeof(double)); - - - ev = 
malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(complex double)); - tmp2 = malloc(na_rows*na_cols*sizeof(complex double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_complex_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as); - - free(xr); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 2stage complex solver\n"); - printf("\n"); - } -#ifdef WITH_MPI - mpierr = MPI_Barrier(MPI_COMM_WORLD); -#endif - THIS_COMPLEX_ELPA_KERNEL_API = ELPA2_COMPLEX_KERNEL_GENERIC; - success = elpa_solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols, my_mpi_comm_world, THIS_COMPLEX_ELPA_KERNEL_API); - - if (success != 1) { - printf("error in ELPA solve \n"); -#ifdef WITH_MPI - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); -#endif - } - - - if (myid == 0) { - printf("\n"); - printf("2stage ELPA complex solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_complex_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - if (myid == 0) { - printf("All ok!\n"); - } - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); -#ifdef WITH_MPI - MPI_Finalize(); -#endif - return 0; -} diff -Nru elpa-2016.05.001/test/c_test_programs/elpa2_test_real_c_version.c elpa-2019.11.001/test/c_test_programs/elpa2_test_real_c_version.c --- elpa-2016.05.001/test/c_test_programs/elpa2_test_real_c_version.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/c_test_programs/elpa2_test_real_c_version.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,240 +0,0 @@ -/* This file is part of ELPA. 
*/ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ - -#include "config-f90.h" - -#include -#include -#ifdef WITH_MPI -#include -#endif -#include - -#include -#include - -int main(int argc, char** argv) { - int myid; - int nprocs; -#ifndef WITH_MPI - int MPI_COMM_WORLD; -#endif - int na, nev, nblk; - - int status; - - int np_cols, np_rows, np_colsStart; - - int my_blacs_ctxt, nprow, npcol, my_prow, my_pcol; - - int mpierr; - - int my_mpi_comm_world; - int mpi_comm_rows, mpi_comm_cols; - - int info, *sc_desc; - - int na_rows, na_cols; - double startVal; - - double *a, *z, *as, *ev, *tmp1, *tmp2; - - int *iseed; - - int success; - - int useQr, THIS_REAL_ELPA_KERNEL_API; -#ifdef WITH_MPI - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myid); -#else - nprocs = 1; - myid=0; - MPI_COMM_WORLD=1; -#endif - na = 1000; - nev = 500; - nblk = 16; - - if (myid == 0) { - printf("This is the c version of an ELPA test-programm\n"); - printf("\n"); - printf("It will call the 1stage ELPA real solver for an\n"); - printf("of matrix size %d. It will compute %d eigenvalues\n",na,nev); - printf("and uses a blocksize of %d\n",nblk); - printf("\n"); - printf("This is an example program with much less functionality\n"); - printf("as it's Fortran counterpart. 
It's only purpose is to show how \n"); - printf("to evoke ELPA1 from a c programm\n"); - printf("\n"); - - } - - status = 0; - - startVal = sqrt((double) nprocs); - np_colsStart = (int) round(startVal); - for (np_cols=np_colsStart;np_cols>1;np_cols--){ - if (nprocs %np_cols ==0){ - break; - } - } - - np_rows = nprocs/np_cols; - - if (myid == 0) { - printf("\n"); - printf("Number of processor rows %d, cols %d, total %d \n",np_rows,np_cols,nprocs); - } - - /* set up blacs */ - /* convert communicators before */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#else - my_mpi_comm_world = 1; -#endif - set_up_blacsgrid_from_fortran(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol); - - if (myid == 0) { - printf("\n"); - printf("Past BLACS_Gridinfo...\n"); - printf("\n"); - } - - /* get the ELPA row and col communicators. */ - /* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */ -#ifdef WITH_MPI - my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD); -#endif - mpierr = get_elpa_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols); - - if (myid == 0) { - printf("\n"); - printf("Past split communicator setup for rows and columns...\n"); - printf("\n"); - } - - sc_desc = malloc(9*sizeof(int)); - - set_up_blacs_descriptor_from_fortran(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); - - if (myid == 0) { - printf("\n"); - printf("Past scalapack descriptor setup...\n"); - printf("\n"); - } - - /* allocate the matrices needed for elpa */ - if (myid == 0) { - printf("\n"); - printf("Allocating matrices with na_rows=%d and na_cols=%d\n",na_rows, na_cols); - printf("\n"); - } - - a = malloc(na_rows*na_cols*sizeof(double)); - z = malloc(na_rows*na_cols*sizeof(double)); - as = malloc(na_rows*na_cols*sizeof(double)); - - - ev = malloc(na*sizeof(double)); - - tmp1 = malloc(na_rows*na_cols*sizeof(double)); - tmp2 = 
malloc(na_rows*na_cols*sizeof(double)); - - iseed = malloc(4096*sizeof(int)); - - prepare_matrix_real_from_fortran(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as); - - if (myid == 0) { - printf("\n"); - printf("Entering ELPA 2stage real solver\n"); - printf("\n"); - } -#ifdef WITH_MPI - mpierr = MPI_Barrier(MPI_COMM_WORLD); -#endif - useQr = 0; - THIS_REAL_ELPA_KERNEL_API = ELPA2_REAL_KERNEL_GENERIC; - - success = elpa_solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols, my_mpi_comm_world, THIS_REAL_ELPA_KERNEL_API, useQr); - - if (success != 1) { - printf("error in ELPA solve \n"); -#ifdef WITH_MPI - mpierr = MPI_Abort(MPI_COMM_WORLD, 99); -#endif - } - - - if (myid == 0) { - printf("\n"); - printf("2stage ELPA real solver complete\n"); - printf("\n"); - } - - /* check the results */ - status = check_correctness_real_from_fortran(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2); - - if (status !=0){ - printf("The computed EVs are not correct !\n"); - } - if (status ==0){ - if (myid ==0) { - printf("All ok!\n"); - } - } - - free(sc_desc); - free(a); - free(z); - free(as); - - free(tmp1); - free(tmp2); -#ifdef WITH_MPI - MPI_Finalize(); -#endif - return 0; -} diff -Nru elpa-2016.05.001/test/Fortran/assert.h elpa-2019.11.001/test/Fortran/assert.h --- elpa-2016.05.001/test/Fortran/assert.h 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/assert.h 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,7 @@ +#define stringify_(x) "x" +#define stringify(x) stringify_(x) +#define assert(x) call x_a(x, stringify(x), "F", __LINE__) + +#define assert_elpa_ok(error_code) call x_ao(error_code, stringify(error_code), __FILE__, __LINE__) + +! 
vim: syntax=fortran diff -Nru elpa-2016.05.001/test/Fortran/elpa2/complex_2stage_banded.F90 elpa-2019.11.001/test/Fortran/elpa2/complex_2stage_banded.F90 --- elpa-2016.05.001/test/Fortran/elpa2/complex_2stage_banded.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa2/complex_2stage_banded.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,295 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! 
any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" +#include "../assert.h" +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 2 complex case library. +!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). +!> If these values are not set default values (500, 150, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +!> The complex ELPA 2 kernel is set as the default kernel. +!> However, this can be overriden by setting +!> the environment variable "COMPLEX_ELPA_KERNEL" to an +!> appropiate value. +!> + +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +program test_complex2_double_banded + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - COMPLEX version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! 
with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +!------------------------------------------------------------------------------- + use elpa + + !use test_util + use test_read_input_parameters + use test_check_correctness + use test_setup_mpi + use test_blacs_infrastructure + use test_prepare_matrix +#ifdef HAVE_REDIRECT + use test_redirect +#endif + use test_output_type + implicit none + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + TEST_INT_TYPE :: nblk + TEST_INT_TYPE :: na, nev + + TEST_INT_TYPE :: np_rows, np_cols, na_rows, na_cols + + TEST_INT_TYPE :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + TEST_INT_TYPE :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + TEST_INT_MPI_TYPE :: mpierr +#ifdef WITH_MPI + !TEST_INT_TYPE, external :: numroc +#endif + complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8) + real(kind=rk8), allocatable :: ev(:) + + complex(kind=ck8), allocatable :: a(:,:), z(:,:), as(:,:) + + TEST_INT_TYPE :: STATUS +#ifdef WITH_OPENMP + TEST_INT_TYPE :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level +#endif + type(output_t) :: write_to_file + integer(kind=c_int) :: error_elpa + character(len=8) :: task_suffix + TEST_INT_TYPE :: j + + + TEST_INT_TYPE :: numberOfDevices + TEST_INT_TYPE :: global_row, global_col, local_row, local_col + TEST_INT_TYPE :: bandwidth + class(elpa_t), pointer :: e + +#define COMPLEXCASE +#define DOUBLE_PRECISION_COMPLEX 1 + + call read_input_parameters(na, nev, nblk, write_to_file) + !------------------------------------------------------------------------------- + ! 
MPI Initialization + call setup_mpi(myid, nprocs) + + STATUS = 0 + + !------------------------------------------------------------------------------- + ! Selection of number of processor rows/columns + ! We try to set up the grid square-like, i.e. start the search for possible + ! divisors of nprocs with a number next to the square root of nprocs + ! and decrement it until a divisor is found. + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + if(myid==0) then + print * + print '(a)','Standard eigenvalue problem - COMPLEX version' + print * + print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print * + endif + + !------------------------------------------------------------------------------- + ! Set up BLACS context and MPI communicators + ! + ! The BLACS context is only necessary for using Scalapack. + ! + ! For ELPA, the MPI communicators along rows/cols are sufficient, + ! and the grid setup may be done in an arbitrary way as long as it is + ! consistent (i.e. 
0<=my_prow bandwidth) then + a(local_row, local_col) = 0 + as(local_row, local_col) = 0 + end if + end do + end do + + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + assert(e%setup() .eq. ELPA_OK) + + call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa) + assert_elpa_ok(error_elpa) + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + call elpa_deallocate(e, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + + !------------------------------------------------------------------------------- + ! 
Test correctness of result (using plain scalapack routines) + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + deallocate(a) + deallocate(as) + + deallocate(z) + deallocate(ev) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call EXIT(STATUS) +end + +!------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/test/Fortran/elpa2/double_instance.F90 elpa-2019.11.001/test/Fortran/elpa2/double_instance.F90 --- elpa-2016.05.001/test/Fortran/elpa2/double_instance.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa2/double_instance.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,244 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! 
GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif +#include "../assert.h" + +program test_interface + use elpa + + use precision_for_tests + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + implicit none + + ! matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_MPI_TYPE :: mpierr + + ! blacs + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + real(kind=C_DOUBLE), allocatable :: a1(:,:), as1(:,:) + ! eigenvectors + real(kind=C_DOUBLE), allocatable :: z1(:,:) + ! eigenvalues + real(kind=C_DOUBLE), allocatable :: ev1(:) + + ! 
The Matrix + complex(kind=C_DOUBLE_COMPLEX), allocatable :: a2(:,:), as2(:,:) + ! eigenvectors + complex(kind=C_DOUBLE_COMPLEX), allocatable :: z2(:,:) + ! eigenvalues + real(kind=C_DOUBLE), allocatable :: ev2(:) + TEST_INT_TYPE :: status + integer(kind=c_int) :: error_elpa + + TEST_INT_TYPE :: solver + TEST_INT_TYPE :: qr + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e1, e2 + + call read_input_parameters(na, nev, nblk, write_to_file) + call setup_mpi(myid, nprocs) + + status = 0 + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + + np_rows = nprocs/np_cols + + my_prow = mod(myid, np_cols) + my_pcol = myid / np_cols + + call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', & + my_blacs_ctxt, my_prow, my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a1 (na_rows,na_cols), as1(na_rows,na_cols)) + allocate(z1 (na_rows,na_cols)) + allocate(ev1(na)) + + a1(:,:) = 0.0 + z1(:,:) = 0.0 + ev1(:) = 0.0 + + call prepare_matrix_random(na, myid, sc_desc, a1, z1, as1) + allocate(a2 (na_rows,na_cols), as2(na_rows,na_cols)) + allocate(z2 (na_rows,na_cols)) + allocate(ev2(na)) + + a2(:,:) = 0.0 + z2(:,:) = 0.0 + ev2(:) = 0.0 + + call prepare_matrix_random(na, myid, sc_desc, a2, z2, as2) + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + e1 => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e1%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("nblk", int(nblk,kind=c_int), error_elpa) + 
assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e1%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e1%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + assert(e1%setup() .eq. ELPA_OK) + + call e1%set("solver", ELPA_SOLVER_2STAGE, error_elpa) + assert_elpa_ok(error_elpa) + + call e1%set("real_kernel", ELPA_2STAGE_REAL_DEFAULT, error_elpa) + assert_elpa_ok(error_elpa) + + + e2 => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e2%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e2%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + assert(e2%setup() .eq. 
ELPA_OK) + + call e2%set("solver", ELPA_SOLVER_1STAGE, error_elpa) + assert_elpa_ok(error_elpa) + + call e1%eigenvectors(a1, ev1, z1, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_deallocate(e1, error_elpa) + assert_elpa_ok(error_elpa) + + call e2%eigenvectors(a2, ev2, z2, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_deallocate(e2, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_uninit(error_elpa) + + status = check_correctness_evp_numeric_residuals(na, nev, as1, z1, ev1, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + deallocate(a1) + deallocate(as1) + deallocate(z1) + deallocate(ev1) + + status = check_correctness_evp_numeric_residuals(na, nev, as2, z2, ev2, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + deallocate(a2) + deallocate(as2) + deallocate(z2) + deallocate(ev2) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call EXIT(STATUS) + + +end program diff -Nru elpa-2016.05.001/test/Fortran/elpa2/real_2stage_banded.F90 elpa-2019.11.001/test/Fortran/elpa2/real_2stage_banded.F90 --- elpa-2016.05.001/test/Fortran/elpa2/real_2stage_banded.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa2/real_2stage_banded.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,294 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! 
+! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" +#include "../assert.h" +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 2 real case library. +!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). +!> If these values are not set default values (500, 150, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +!> The real ELPA 2 kernel is set as the default kernel. +!> However, this can be overriden by setting +!> the environment variable "REAL_ELPA_KERNEL" to an +!> appropiate value. 
+!> + +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +program test_real2_double_banded + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - REAL version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +!------------------------------------------------------------------------------- + use elpa + + !use test_util + use test_read_input_parameters + use test_check_correctness + use test_setup_mpi + use test_blacs_infrastructure + use test_prepare_matrix +#ifdef HAVE_REDIRECT + use test_redirect +#endif + use test_output_type + implicit none + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! 
nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + TEST_INT_TYPE :: nblk + TEST_INT_TYPE :: na, nev + + TEST_INT_TYPE :: np_rows, np_cols, na_rows, na_cols + + TEST_INT_TYPE :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + TEST_INT_TYPE :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + TEST_INT_MPI_TYPE :: mpierr + !TEST_INT_TYPE, external :: numroc + + real(kind=rk8), allocatable :: a(:,:), z(:,:), as(:,:), ev(:) + + TEST_INT_TYPE :: STATUS +#ifdef WITH_OPENMP + TEST_INT_TYPE :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level +#endif + integer(kind=c_int) :: error_elpa + TEST_INT_TYPE :: numberOfDevices + type(output_t) :: write_to_file + character(len=8) :: task_suffix + TEST_INT_TYPE :: j + TEST_INT_TYPE :: global_row, global_col, local_row, local_col + TEST_INT_TYPE :: bandwidth + class(elpa_t), pointer :: e +#define DOUBLE_PRECISION_REAL 1 + + + call read_input_parameters(na, nev, nblk, write_to_file) + + !------------------------------------------------------------------------------- + ! MPI Initialization + call setup_mpi(myid, nprocs) + + STATUS = 0 + +#define REALCASE + + !------------------------------------------------------------------------------- + ! Selection of number of processor rows/columns + ! We try to set up the grid square-like, i.e. start the search for possible + ! divisors of nprocs with a number next to the square root of nprocs + ! and decrement it until a divisor is found. + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! 
at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + if(myid==0) then + print * + print '(a)','Standard eigenvalue problem - REAL version' + print * + print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print * + endif + + !------------------------------------------------------------------------------- + ! Set up BLACS context and MPI communicators + ! + ! The BLACS context is only necessary for using Scalapack. + ! + ! For ELPA, the MPI communicators along rows/cols are sufficient, + ! and the grid setup may be done in an arbitrary way as long as it is + ! consistent (i.e. 0<=my_prow bandwidth) then + a(local_row, local_col) = 0.0 + as(local_row, local_col) = 0.0 + end if + end do + end do + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + assert(e%setup() .eq. 
ELPA_OK) + + call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa) + assert_elpa_ok(error_elpa) + + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_deallocate(e, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + + + !------------------------------------------------------------------------------- + ! Test correctness of result (using plain scalapack routines) + + + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + + deallocate(a) + deallocate(as) + + deallocate(z) + deallocate(ev) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call EXIT(STATUS) +end + +!------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/test/Fortran/elpa2/single_complex_2stage_banded.F90 elpa-2019.11.001/test/Fortran/elpa2/single_complex_2stage_banded.F90 --- elpa-2016.05.001/test/Fortran/elpa2/single_complex_2stage_banded.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa2/single_complex_2stage_banded.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,295 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! 
+! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + + + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +#include "../assert.h" +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 2 complex case library. +!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). 
+!> If these values are not set default values (500, 150, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +!> The complex ELPA 2 kernel is set as the default kernel. +!> However, this can be overriden by setting +!> the environment variable "COMPLEX_ELPA_KERNEL" to an +!> appropiate value. +!> +program test_complex2_single_banded + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - COMPLEX version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +!------------------------------------------------------------------------------- + use elpa + + use test_util + use test_read_input_parameters + use test_check_correctness + use test_setup_mpi + use test_blacs_infrastructure + use test_prepare_matrix +#ifdef HAVE_REDIRECT + use test_redirect +#endif + + use test_output_type + implicit none + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! 
nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + TEST_INT_TYPE :: nblk + TEST_INT_TYPE :: na, nev + + TEST_INT_TYPE :: np_rows, np_cols, na_rows, na_cols + + TEST_INT_TYPE :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + TEST_INT_TYPE :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + TEST_INT_MPI_TYPE :: mpierr +#ifdef WITH_MPI + !TEST_INT_TYPE, external :: numroc +#endif + complex(kind=ck4), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4) + real(kind=rk4), allocatable :: ev(:) + + complex(kind=ck4), allocatable :: a(:,:), z(:,:), as(:,:) + + TEST_INT_TYPE :: STATUS +#ifdef WITH_OPENMP + TEST_INT_TYPE :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level +#endif + type(output_t) :: write_to_file + integer(kind=ik) :: error_elpa + character(len=8) :: task_suffix + TEST_INT_TYPE :: j + + + TEST_INT_TYPE :: global_row, global_col, local_row, local_col + TEST_INT_TYPE :: bandwidth + class(elpa_t), pointer :: e + +#define COMPLEXCASE +#define DOUBLE_PRECISION_COMPLEX 1 + + call read_input_parameters(na, nev, nblk, write_to_file) + !------------------------------------------------------------------------------- + ! MPI Initialization + call setup_mpi(myid, nprocs) + + STATUS = 0 + + !------------------------------------------------------------------------------- + ! Selection of number of processor rows/columns + ! We try to set up the grid square-like, i.e. start the search for possible + ! divisors of nprocs with a number next to the square root of nprocs + ! and decrement it until a divisor is found. + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! 
at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + if(myid==0) then + print * + print '(a)','Standard eigenvalue problem - COMPLEX version' + print * + print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print * + endif + + !------------------------------------------------------------------------------- + ! Set up BLACS context and MPI communicators + ! + ! The BLACS context is only necessary for using Scalapack. + ! + ! For ELPA, the MPI communicators along rows/cols are sufficient, + ! and the grid setup may be done in an arbitrary way as long as it is + ! consistent (i.e. 0<=my_prow bandwidth) then + a(local_row, local_col) = 0 + as(local_row, local_col) = 0 + end if + end do + end do + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + assert(e%setup() .eq. 
ELPA_OK) + + call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa) + assert_elpa_ok(error_elpa) + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_deallocate(e, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + + !------------------------------------------------------------------------------- + ! Test correctness of result (using plain scalapack routines) + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + deallocate(a) + deallocate(as) + + deallocate(z) + deallocate(ev) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call EXIT(STATUS) +end + +!------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/test/Fortran/elpa2/single_real_2stage_banded.F90 elpa-2019.11.001/test/Fortran/elpa2/single_real_2stage_banded.F90 --- elpa-2016.05.001/test/Fortran/elpa2/single_real_2stage_banded.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa2/single_real_2stage_banded.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,287 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif +#include "../assert.h" +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 2 real case library. +!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). 
+!> If these values are not set default values (500, 150, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +!> The real ELPA 2 kernel is set as the default kernel. +!> However, this can be overriden by setting +!> the environment variable "REAL_ELPA_KERNEL" to an +!> appropiate value. +!> +program test_real2_single_banded + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - REAL version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +!------------------------------------------------------------------------------- + use elpa + + !use test_util + use test_read_input_parameters + use test_check_correctness + use test_setup_mpi + use test_blacs_infrastructure + use test_prepare_matrix +#ifdef HAVE_REDIRECT + use test_redirect +#endif + use test_output_type + use tests_scalapack_interfaces + implicit none + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! 
nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + TEST_INT_TYPE :: nblk + TEST_INT_TYPE :: na, nev + + TEST_INT_TYPE :: np_rows, np_cols, na_rows, na_cols + + TEST_INT_TYPE :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + TEST_INT_TYPE :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + TEST_INT_MPI_TYPE :: mpierr + + real(kind=rk4), allocatable :: a(:,:), z(:,:), as(:,:), ev(:) + + TEST_INT_TYPE :: STATUS +#ifdef WITH_OPENMP + TEST_INT_TYPE :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level +#endif + integer(kind=c_int) :: error_elpa + type(output_t) :: write_to_file + character(len=8) :: task_suffix + TEST_INT_TYPE :: j + TEST_INT_TYPE :: global_row, global_col, local_row, local_col + TEST_INT_TYPE :: bandwidth + class(elpa_t), pointer :: e +#define DOUBLE_PRECISION_REAL 1 + + call read_input_parameters(na, nev, nblk, write_to_file) + + !------------------------------------------------------------------------------- + ! MPI Initialization + call setup_mpi(myid, nprocs) + + + STATUS = 0 + +#define REALCASE + + !------------------------------------------------------------------------------- + ! Selection of number of processor rows/columns + ! We try to set up the grid square-like, i.e. start the search for possible + ! divisors of nprocs with a number next to the square root of nprocs + ! and decrement it until a divisor is found. + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! 
at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + if(myid==0) then + print * + print '(a)','Standard eigenvalue problem - REAL version' + print * + print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print * + endif + + !------------------------------------------------------------------------------- + ! Set up BLACS context and MPI communicators + ! + ! The BLACS context is only necessary for using Scalapack. + ! + ! For ELPA, the MPI communicators along rows/cols are sufficient, + ! and the grid setup may be done in an arbitrary way as long as it is + ! consistent (i.e. 0<=my_prow bandwidth) then + a(local_row, local_col) = 0.0 + as(local_row, local_col) = 0.0 + end if + end do + end do + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + assert(e%setup() .eq. 
ELPA_OK) + + call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa) + assert_elpa_ok(error_elpa) + + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_deallocate(e, error_elpa) + assert_elpa_ok(error_elpa) + + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + + + !------------------------------------------------------------------------------- + ! Test correctness of result (using plain scalapack routines) + + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + deallocate(a) + deallocate(as) + + deallocate(z) + deallocate(ev) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call EXIT(STATUS) +end + +!------------------------------------------------------------------------------- diff -Nru elpa-2016.05.001/test/Fortran/elpa_print_headers.F90 elpa-2019.11.001/test/Fortran/elpa_print_headers.F90 --- elpa-2016.05.001/test/Fortran/elpa_print_headers.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/elpa_print_headers.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,273 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#ifdef WITH_OPENMP + if (myid .eq. 0) then + print *,"Threaded version of test program" + print *,"Using ",omp_get_max_threads()," threads" + print *," " + endif +#endif + +#ifndef WITH_MPI + if (myid .eq. 0) then + print *,"This version of ELPA does not support MPI parallelisation" + print *,"For MPI support re-build ELPA with appropiate flags" + print *," " + endif +#endif + +#ifdef ELPA1 + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + if (myid .eq. 0) then + print *," " + print *,"Real valued double-precision version of ELPA1 is used" + print *," " + endif +#else + if (myid .eq. 
0) then + print *," " + print *,"Real valued single-precision version of ELPA1 is used" + print *," " + endif +#endif + +#endif + +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + if (myid .eq. 0) then + print *," " + print *,"Complex valued double-precision version of ELPA1 is used" + print *," " + endif +#else + if (myid .eq. 0) then + print *," " + print *,"Complex valued single-precision version of ELPA1 is used" + print *," " + endif +#endif + +#endif /* DATATYPE */ + +#else /* ELPA1 */ + +#ifdef REALCASE +#ifdef DOUBLE_PRECISION_REAL + if (myid .eq. 0) then + print *," " + print *,"Real valued double-precision version of ELPA2 is used" + print *," " + endif +#else + if (myid .eq. 0) then + print *," " + print *,"Real valued single-precision version of ELPA2 is used" + print *," " + endif +#endif + +#endif + +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION_COMPLEX + if (myid .eq. 0) then + print *," " + print *,"Complex valued double-precision version of ELPA2 is used" + print *," " + endif +#else + if (myid .eq. 0) then + print *," " + print *,"Complex valued single-precision version of ELPA2 is used" + print *," " + endif +#endif + +#endif /* DATATYPE */ + +#endif /* ELPA1 */ + +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif +#ifdef HAVE_REDIRECT + if (check_redirect_environment_variable()) then + if (myid .eq. 0) then + print *," " + print *,"Redirection of mpi processes is used" + print *," " + if (create_directories() .ne. 1) then + write(error_unit,*) "Unable to create directory for stdout and stderr!" + stop 1 + endif + endif +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + call redirect_stdout(myid) + endif +#endif + +#ifndef ELPA1 + + if (myid .eq. 
0) then + print *," " + print *,"This ELPA2 is build with" +#ifdef WITH_GPU_KERNEL + print *,"GPU support" +#endif + print *," " +#ifdef REALCASE + +#ifdef HAVE_AVX2 + +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + print *,"AVX2 optimized kernel (2 blocking) for real matrices" +#endif +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + print *,"AVX2 optimized kernel (4 blocking) for real matrices" +#endif +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + print *,"AVX2 optimized kernel (6 blocking) for real matrices" +#endif + +#else /* no HAVE_AVX2 */ + +#ifdef HAVE_AVX + +#ifdef WITH_REAL_AVX_BLOCK2_KERNEL + print *,"AVX optimized kernel (2 blocking) for real matrices" +#endif +#ifdef WITH_REAL_AVX_BLOCK4_KERNEL + print *,"AVX optimized kernel (4 blocking) for real matrices" +#endif +#ifdef WITH_REAL_AVX_BLOCK6_KERNEL + print *,"AVX optimized kernel (6 blocking) for real matrices" +#endif + +#endif + +#endif /* HAVE_AVX2 */ + + +#ifdef WITH_REAL_GENERIC_KERNEL + print *,"GENERIC kernel for real matrices" +#endif +#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL + print *,"GENERIC SIMPLE kernel for real matrices" +#endif +#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL + print *,"SSE ASSEMBLER kernel for real matrices" +#endif +#ifdef WITH_REAL_BGP_KERNEL + print *,"BGP kernel for real matrices" +#endif +#ifdef WITH_REAL_BGQ_KERNEL + print *,"BGQ kernel for real matrices" +#endif + +#endif /* DATATYPE == REAL */ + +#ifdef COMPLEXCASE + +#ifdef HAVE_AVX2 + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + print *,"AVX2 optimized kernel (2 blocking) for complex matrices" +#endif +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + print *,"AVX2 optimized kernel (1 blocking) for complex matrices" +#endif + +#else /* no HAVE_AVX2 */ + +#ifdef HAVE_AVX + +#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL + print *,"AVX optimized kernel (2 blocking) for complex matrices" +#endif +#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL + print *,"AVX optimized kernel (1 blocking) for complex matrices" +#endif + +#endif + +#endif /* HAVE_AVX2 */ + + +#ifdef 
WITH_COMPLEX_GENERIC_KERNEL + print *,"GENERIC kernel for complex matrices" +#endif +#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL + print *,"GENERIC SIMPLE kernel for complex matrices" +#endif +#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL + print *,"SSE ASSEMBLER kernel for complex matrices" +#endif + +#endif /* DATATYPE == COMPLEX */ + + endif +#endif /* ELPA1 */ + + if (write_to_file%eigenvectors) then + if (myid .eq. 0) print *,"Writing Eigenvectors to files" + endif + + if (write_to_file%eigenvalues) then + if (myid .eq. 0) print *,"Writing Eigenvalues to files" + endif + + diff -Nru elpa-2016.05.001/test/Fortran/test_autotune.F90 elpa-2019.11.001/test/Fortran/test_autotune.F90 --- elpa-2016.05.001/test/Fortran/test_autotune.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/test_autotune.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,306 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +! Define one of TEST_REAL or TEST_COMPLEX +! Define one of TEST_SINGLE or TEST_DOUBLE +! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE +! Define TEST_GPU \in [0, 1] +! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel] + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +error: define exactly one of TEST_REAL or TEST_COMPLEX +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +error: define exactly one of TEST_SINGLE or TEST_DOUBLE +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE real(kind=C_FLOAT) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_FLOAT) +# else +# define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX) +# endif +#else +# define EV_TYPE real(kind=C_DOUBLE) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_DOUBLE) +# else +# define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX) +# endif +#endif + + +#ifdef TEST_REAL +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL +#else +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX +#endif + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t 
+#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif +#include "assert.h" + +program test + use elpa + + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + use test_analytic + use iso_fortran_env + +#ifdef HAVE_REDIRECT + use test_redirect +#endif + implicit none + + ! matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_MPI_TYPE :: mpierr + + ! blacs + character(len=1) :: layout + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + MATRIX_TYPE, allocatable :: a(:,:), as(:,:) + ! eigenvectors + MATRIX_TYPE, allocatable :: z(:,:) + ! 
eigenvalues + EV_TYPE, allocatable :: ev(:) + + TEST_INT_TYPE :: status + integer(kind=c_int) :: error_elpa + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e + class(elpa_autotune_t), pointer :: tune_state + + TEST_INT_TYPE :: iter + character(len=5) :: iter_string + + call read_input_parameters(na, nev, nblk, write_to_file) + call setup_mpi(myid, nprocs) +#ifdef HAVE_REDIRECT +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + call redirect_stdout(myid) +#endif +#endif + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + layout = 'C' + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + np_rows = nprocs/np_cols + assert(nprocs == np_rows * np_cols) + + if (myid == 0) then + print '((a,i0))', 'Matrix size: ', na + print '((a,i0))', 'Num eigenvectors: ', nev + print '((a,i0))', 'Blocksize: ', nblk +#ifdef WITH_MPI + print '((a,i0))', 'Num MPI proc: ', nprocs + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print '(a)', 'Process layout: ' // layout +#endif + print *,'' + endif + + call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a (na_rows,na_cols)) + allocate(as(na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + allocate(ev(na)) + + a(:,:) = 0.0 + z(:,:) = 0.0 + ev(:) = 0.0 + + call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times=.false.) 
+ as(:,:) = a(:,:) + + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + call e%set("timings",1, error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("debug",1, error_elpa) + assert_elpa_ok(error_elpa) + call e%set("gpu", 0, error_elpa) + assert_elpa_ok(error_elpa) + !call e%set("max_stored_rows", 15, error_elpa) + + assert_elpa_ok(e%setup()) + + if (myid == 0) print *, "" + + tune_state => e%autotune_setup(ELPA_AUTOTUNE_MEDIUM, AUTOTUNE_DOMAIN, error_elpa) + assert_elpa_ok(error_elpa) + + iter=0 + do while (e%autotune_step(tune_state, error_elpa)) + assert_elpa_ok(error_elpa) + iter=iter+1 + write(iter_string,'(I5.5)') iter + !call e%print_settings() + !call e%store_settings("saved_parameters_"//trim(iter_string)//".txt") + call e%timer_start("eigenvectors: iteration "//trim(iter_string)) + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + call e%timer_stop("eigenvectors: iteration "//trim(iter_string)) + + assert_elpa_ok(error_elpa) + if (myid .eq. 0) then + print *, "" + call e%print_times("eigenvectors: iteration "//trim(iter_string)) + endif + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, & + .true., .true., print_times=.false.) 
+ a(:,:) = as(:,:) + !call e%autotune_print_state(tune_state) + !call e%autotune_save_state(tune_state, "saved_state_"//trim(iter_string)//".txt") + end do + + ! set and print the autotuned-settings + call e%autotune_set_best(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + if (myid .eq. 0) then + print *, "The best combination found by the autotuning:" + flush(output_unit) + call e%autotune_print_best(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + endif + ! de-allocate autotune object + call elpa_autotune_deallocate(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + + if (myid .eq. 0) then + print *, "Running once more time with the best found setting..." + endif + call e%timer_start("eigenvectors: best setting") + call e%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + call e%timer_stop("eigenvectors: best setting") + assert_elpa_ok(error_elpa) + if (myid .eq. 0) then + print *, "" + call e%print_times("eigenvectors: best setting") + endif + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, & + .true., .true., print_times=.false.) + + call elpa_deallocate(e,error_elpa) + assert_elpa_ok(error_elpa) + + deallocate(a) + deallocate(as) + deallocate(z) + deallocate(ev) + + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + + call exit(status) + +end program diff -Nru elpa-2016.05.001/test/Fortran/test.F90 elpa-2019.11.001/test/Fortran/test.F90 --- elpa-2016.05.001/test/Fortran/test.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/test.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,901 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! 
Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +! Define one of TEST_REAL or TEST_COMPLEX +! Define one of TEST_SINGLE or TEST_DOUBLE +! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE +! Define TEST_GPU \in [0, 1] +! 
Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel] + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +error: define exactly one of TEST_REAL or TEST_COMPLEX +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +error: define exactly one of TEST_SINGLE or TEST_DOUBLE +#endif + +#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE) ^ defined(TEST_SCALAPACK_ALL) ^ defined(TEST_SCALAPACK_PART)) +error: define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE or TEST_SCALAPACK_ALL or TEST_SCALAPACK_PART +#endif + +#ifdef TEST_SOLVER_1STAGE +#ifdef TEST_ALL_KERNELS +error: TEST_ALL_KERNELS cannot be defined for TEST_SOLVER_1STAGE +#endif +#ifdef TEST_KERNEL +error: TEST_KERNEL cannot be defined for TEST_SOLVER_1STAGE +#endif +#endif + +#ifdef TEST_SOLVER_2STAGE +#if !(defined(TEST_KERNEL) ^ defined(TEST_ALL_KERNELS)) +error: define either TEST_ALL_KERNELS or a valid TEST_KERNEL +#endif +#endif + +#ifdef TEST_GENERALIZED_DECOMP_EIGENPROBLEM +#define TEST_GENERALIZED_EIGENPROBLEM +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE real(kind=C_FLOAT) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_FLOAT) +# else +# define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX) +# endif +#else +# define EV_TYPE real(kind=C_DOUBLE) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_DOUBLE) +# else +# define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX) +# endif +#endif + +#ifdef TEST_REAL +#define KERNEL_KEY "real_kernel" +#endif +#ifdef TEST_COMPLEX +#define KERNEL_KEY "complex_kernel" +#endif + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif +#include "assert.h" + 
+program test + use elpa + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + use test_analytic +#ifdef WITH_SCALAPACK_TESTS + use test_scalapack +#endif + +#ifdef HAVE_REDIRECT + use test_redirect +#endif +#ifdef WITH_OPENMP + use omp_lib +#endif + use precision_for_tests + + implicit none + + ! matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_MPI_TYPE :: myidMPI, nprocsMPI + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_MPI_TYPE :: mpierr + + ! blacs + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + MATRIX_TYPE, allocatable :: a(:,:), as(:,:) +#if defined(TEST_HERMITIAN_MULTIPLY) + MATRIX_TYPE, allocatable :: b(:,:), c(:,:) +#endif +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + MATRIX_TYPE, allocatable :: b(:,:), bs(:,:) +#endif + ! eigenvectors + MATRIX_TYPE, allocatable :: z(:,:) + ! 
eigenvalues + EV_TYPE, allocatable :: ev(:) + + logical :: check_all_evals, skip_check_correctness + +#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK) + EV_TYPE, allocatable :: d(:), sd(:), ds(:), sds(:) + EV_TYPE :: diagonalELement, subdiagonalElement +#endif + + TEST_INT_TYPE :: status + integer(kind=c_int) :: error_elpa + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e +#ifdef TEST_ALL_KERNELS + TEST_INT_TYPE :: i +#endif +#ifdef TEST_ALL_LAYOUTS + character(len=1), parameter :: layouts(2) = [ 'C', 'R' ] + TEST_INT_TYPE :: i_layout +#endif + integer(kind=c_int):: kernel + character(len=1) :: layout + logical :: do_test_numeric_residual, do_test_numeric_residual_generalized, & + do_test_analytic_eigenvalues, & + do_test_analytic_eigenvalues_eigenvectors, & + do_test_frank_eigenvalues, & + do_test_toeplitz_eigenvalues, do_test_cholesky, & + do_test_hermitian_multiply + +#ifdef WITH_OPENMP + TEST_INT_TYPE :: max_threads, threads_caller +#endif + +#ifdef SPLIT_COMM_MYSELF + TEST_INT_MPI_TYPE :: mpi_comm_rows, mpi_comm_cols, mpi_string_length, mpierr2 + character(len=MPI_MAX_ERROR_STRING) :: mpierr_string +#endif + + call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness) + call setup_mpi(myid, nprocs) + +#ifdef HAVE_REDIRECT +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + call redirect_stdout(myid) +#endif +#endif + + check_all_evals = .true. + + + do_test_numeric_residual = .false. + do_test_numeric_residual_generalized = .false. + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. + + do_test_cholesky = .false. +#if defined(TEST_CHOLESKY) + do_test_cholesky = .true. +#endif + do_test_hermitian_multiply = .false. +#if defined(TEST_HERMITIAN_MULTIPLY) + do_test_hermitian_multiply = .true. 
+#endif + + status = 0 + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + if (myid == 0) then + print '((a,i0))', 'Program ' // TEST_CASE + print *, "" + endif + +#ifdef TEST_ALL_LAYOUTS + do i_layout = 1, size(layouts) ! layouts + layout = layouts(i_layout) + do np_cols = 1, nprocs ! factors + if (mod(nprocs,np_cols) /= 0 ) then + cycle + endif +#else + layout = 'C' + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo +#endif + + np_rows = nprocs/np_cols + assert(nprocs == np_rows * np_cols) + + if (myid == 0) then + print '((a,i0))', 'Matrix size: ', na + print '((a,i0))', 'Num eigenvectors: ', nev + print '((a,i0))', 'Blocksize: ', nblk +#ifdef WITH_MPI + print '((a,i0))', 'Num MPI proc: ', nprocs + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print '(a)', 'Process layout: ' // layout +#endif + print *,'' + endif + +#if TEST_QR_DECOMPOSITION == 1 + +#if TEST_GPU == 1 +#ifdef WITH_MPI + call mpi_finalize(mpierr) +#endif + stop 77 +#endif /* TEST_GPU */ + if (nblk .lt. 64) then + if (myid .eq. 0) then + print *,"At the moment QR decomposition need blocksize of at least 64" + endif + if ((na .lt. 64) .and. (myid .eq. 
0)) then + print *,"This is why the matrix size must also be at least 64 or only 1 MPI task can be used" + endif + +#ifdef WITH_MPI + call mpi_finalize(mpierr) +#endif + stop 77 + endif +#endif /* TEST_QR_DECOMPOSITION */ + + + call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, & + np_cols, layout, my_blacs_ctxt, my_prow, & + my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a (na_rows,na_cols)) + allocate(as(na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + allocate(ev(na)) + +#ifdef TEST_HERMITIAN_MULTIPLY + allocate(b (na_rows,na_cols)) + allocate(c (na_rows,na_cols)) +#endif + +#ifdef TEST_GENERALIZED_EIGENPROBLEM + allocate(b (na_rows,na_cols)) + allocate(bs (na_rows,na_cols)) +#endif + +#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK) + allocate(d (na), ds(na)) + allocate(sd (na), sds(na)) +#endif + + a(:,:) = 0.0 + z(:,:) = 0.0 + ev(:) = 0.0 + +#if defined(TEST_MATRIX_RANDOM) && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY) && !defined(TEST_EIGENVALUES) + ! the random matrix can be used in allmost all tests; but for some no + ! correctness checks have been implemented; do not allow these + ! combinations + ! RANDOM + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX + ! RANDOM + TEST_CHOLESKY: wee need SPD matrix + ! RANDOM + TEST_EIGENVALUES: no correctness check known + + ! We also have to take care of special case in TEST_EIGENVECTORS +#if !defined(TEST_EIGENVECTORS) + call prepare_matrix_random(na, myid, sc_desc, a, z, as) +#else /* TEST_EIGENVECTORS */ + if (nev .ge. 1) then + call prepare_matrix_random(na, myid, sc_desc, a, z, as) +#ifndef TEST_HERMITIAN_MULTIPLY + do_test_numeric_residual = .true. +#endif + else + if (myid .eq. 
0) then + print *,"At the moment with the random matrix you need nev >=1" + endif +#ifdef WITH_MPI + call mpi_finalize(mpierr) +#endif + stop 77 + endif +#endif /* TEST_EIGENVECTORS */ + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. +#endif /* (TEST_MATRIX_RANDOM) */ + +#if defined(TEST_MATRIX_RANDOM) && defined(TEST_CHOLESKY) + call prepare_matrix_random_spd(na, myid, sc_desc, a, z, as, & + nblk, np_rows, np_cols, my_prow, my_pcol) + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. +#endif /* TEST_MATRIX_RANDOM and TEST_CHOLESKY */ + +#if defined(TEST_MATRIX_RANDOM) && defined(TEST_GENERALIZED_EIGENPROBLEM) + ! call prepare_matrix_random(na, myid, sc_desc, a, z, as) + call prepare_matrix_random_spd(na, myid, sc_desc, b, z, bs, & + nblk, np_rows, np_cols, my_prow, my_pcol) + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. + do_test_numeric_residual = .false. + do_test_numeric_residual_generalized = .true. +#endif /* TEST_MATRIX_RANDOM and TEST_GENERALIZED_EIGENPROBLEM */ + +#if defined(TEST_MATRIX_RANDOM) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_EIGENVALUES)) +#error "Random matrix is not allowed in this configuration" +#endif + +#if defined(TEST_MATRIX_ANALYTIC) && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY) + ! the analytic matrix can be used in allmost all tests; but for some no + ! correctness checks have been implemented; do not allow these + ! combinations + ! ANALYTIC + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX + ! 
ANALTIC + TEST_CHOLESKY: no correctness check yet implemented + + call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + as(:,:) = a + + do_test_numeric_residual = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. +#ifndef TEST_HERMITIAN_MULTIPLY + do_test_analytic_eigenvalues = .true. +#endif +#if defined(TEST_EIGENVECTORS) + if (nev .ge. 1) then + do_test_analytic_eigenvalues_eigenvectors = .true. + do_test_analytic_eigenvalues = .false. + else + do_test_analytic_eigenvalues_eigenvectors = .false. + endif +#endif + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. +#endif /* TEST_MATRIX_ANALYTIC */ +#if defined(TEST_MATRIX_ANALYTIC) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_CHOLESKY)) +#error "Analytic matrix is not allowd in this configuration" +#endif + +#if defined(TEST_MATRIX_TOEPLITZ) + ! The Toeplitz matrix works in each test +#ifdef TEST_SINGLE + diagonalElement = 0.45_c_float + subdiagonalElement = 0.78_c_float +#else + diagonalElement = 0.45_c_double + subdiagonalElement = 0.78_c_double +#endif + +! actually we test cholesky for diagonal matrix only +#if defined(TEST_CHOLESKY) +#ifdef TEST_SINGLE + diagonalElement = (2.546_c_float, 0.0_c_float) + subdiagonalElement = (0.0_c_float, 0.0_c_float) +#else + diagonalElement = (2.546_c_double, 0.0_c_double) + subdiagonalElement = (0.0_c_double, 0.0_c_double) +#endif +#endif /* TEST_CHOLESKY */ + + call prepare_matrix_toeplitz(na, diagonalElement, subdiagonalElement, & + d, sd, ds, sds, a, as, nblk, np_rows, & + np_cols, my_prow, my_pcol) + + + do_test_numeric_residual = .false. +#if defined(TEST_EIGENVECTORS) + if (nev .ge. 1) then + do_test_numeric_residual = .true. + else + do_test_numeric_residual = .false. + endif +#endif + + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. +#if defined(TEST_CHOLESKY) + do_test_toeplitz_eigenvalues = .false. 
+#else + do_test_toeplitz_eigenvalues = .true. +#endif + +#endif /* TEST_MATRIX_TOEPLITZ */ + + +#if defined(TEST_MATRIX_FRANK) && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY) + ! the random matrix can be used in allmost all tests; but for some no + ! correctness checks have been implemented; do not allow these + ! combinations + ! FRANK + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX + ! FRANK + TEST_CHOLESKY: no correctness check yet implemented + + ! We also have to take care of special case in TEST_EIGENVECTORS +#if !defined(TEST_EIGENVECTORS) + call prepare_matrix_frank(na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) + + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. +#ifndef TEST_HERMITIAN_MULTIPLY + do_test_frank_eigenvalues = .true. +#endif + do_test_toeplitz_eigenvalues = .false. + +#else /* TEST_EIGENVECTORS */ + + if (nev .ge. 1) then + call prepare_matrix_frank(na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) + + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. +#ifndef TEST_HERMITIAN_MULTIPLY + do_test_frank_eigenvalues = .true. +#endif + do_test_toeplitz_eigenvalues = .false. + do_test_numeric_residual = .false. + else + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. +#ifndef TEST_HERMITIAN_MULTIPLY + do_test_frank_eigenvalues = .true. +#endif + do_test_toeplitz_eigenvalues = .false. + do_test_numeric_residual = .false. 
+ + endif + +#endif /* TEST_EIGENVECTORS */ +#endif /* (TEST_MATRIX_FRANK) */ +#if defined(TEST_MATRIX_FRANK) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_CHOLESKY)) +#error "FRANK matrix is not allowed in this configuration" +#endif + + +#ifdef TEST_HERMITIAN_MULTIPLY +#ifdef TEST_REAL + +#ifdef TEST_DOUBLE + b(:,:) = 2.0_c_double * a(:,:) + c(:,:) = 0.0_c_double +#else + b(:,:) = 2.0_c_float * a(:,:) + c(:,:) = 0.0_c_float +#endif + +#endif /* TEST_REAL */ + +#ifdef TEST_COMPLEX + +#ifdef TEST_DOUBLE + b(:,:) = 2.0_c_double * a(:,:) + c(:,:) = (0.0_c_double, 0.0_c_double) +#else + b(:,:) = 2.0_c_float * a(:,:) + c(:,:) = (0.0_c_float, 0.0_c_float) +#endif + +#endif /* TEST_COMPLEX */ + +#endif /* TEST_HERMITIAN_MULTIPLY */ + +! if the test is used for (repeated) performacne tests, one might want to skip the checking +! of the results, which might be time-consuming and not necessary. + if(skip_check_correctness) then + do_test_numeric_residual = .false. + do_test_numeric_residual_generalized = .false. + do_test_analytic_eigenvalues = .false. + do_test_analytic_eigenvalues_eigenvectors = .false. + do_test_frank_eigenvalues = .false. + do_test_toeplitz_eigenvalues = .false. + do_test_cholesky = .false. 
+ endif + + +#ifdef WITH_OPENMP + threads_caller = omp_get_max_threads() + if (myid == 0) then + print *,"The calling program uses ",threads_caller," threads" + endif +#endif + + e => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call e%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI +#ifdef SPLIT_COMM_MYSELF + call mpi_comm_split(MPI_COMM_WORLD, int(my_pcol,kind=MPI_KIND), int(my_prow,kind=MPI_KIND), & + mpi_comm_rows, mpierr) + if (mpierr .ne. MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_length, mpierr2) + write(error_unit,*) "MPI ERROR occured during mpi_comm_split for row communicator: ", trim(mpierr_string) + stop 1 + endif + + call mpi_comm_split(MPI_COMM_WORLD, int(my_prow,kind=MPI_KIND), int(my_pcol,kind=MPI_KIND), & + mpi_comm_cols, mpierr) + if (mpierr .ne. 
MPI_SUCCESS) then + call MPI_ERROR_STRING(mpierr,mpierr_string, mpi_string_length, mpierr2) + write(error_unit,*) "MPI ERROR occured during mpi_comm_split for col communicator: ", trim(mpierr_string) + stop 1 + endif + + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("mpi_comm_rows", int(mpi_comm_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("mpi_comm_cols", int(mpi_comm_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#else + call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif +#endif +#ifdef TEST_GENERALIZED_EIGENPROBLEM + call e%set("blacs_context", int(my_blacs_ctxt,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + call e%set("timings", 1_ik, error_elpa) + assert_elpa_ok(e%setup()) + +#ifdef TEST_SOLVER_1STAGE + call e%set("solver", ELPA_SOLVER_1STAGE, error_elpa) +#else + call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa) +#endif + assert_elpa_ok(error_elpa) + + call e%set("gpu", TEST_GPU, error_elpa) + assert_elpa_ok(error_elpa) + +#if TEST_QR_DECOMPOSITION == 1 + call e%set("qr", 1_ik, error_elpa) + assert_elpa_ok(error_elpa) +#endif + +#ifdef WITH_OPENMP + max_threads=omp_get_max_threads() + call e%set("omp_threads", int(max_threads,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + + if (myid == 0) print *, "" + +#ifdef TEST_ALL_KERNELS + do i = 0, elpa_option_cardinality(KERNEL_KEY) ! kernels + if (TEST_GPU .eq. 0) then + kernel = elpa_option_enumerate(KERNEL_KEY, int(i,kind=c_int)) + if (kernel .eq. ELPA_2STAGE_REAL_GPU) continue + if (kernel .eq. 
ELPA_2STAGE_COMPLEX_GPU) continue + endif +#endif +#ifdef TEST_KERNEL + kernel = TEST_KERNEL +#endif + +#ifdef TEST_SOLVER_2STAGE + call e%set(KERNEL_KEY, kernel, error_elpa) +#ifdef TEST_KERNEL + assert_elpa_ok(error_elpa) +#else + if (error_elpa /= ELPA_OK) then + cycle + endif + ! actually used kernel might be different if forced via environment variables + call e%get(KERNEL_KEY, kernel, error_elpa) + assert_elpa_ok(error_elpa) +#endif + if (myid == 0) then + print *, elpa_int_value_to_string(KERNEL_KEY, kernel) // " kernel" + endif +#endif + + +! print all parameters + call e%print_settings(error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef TEST_ALL_KERNELS + call e%timer_start(elpa_int_value_to_string(KERNEL_KEY, kernel)) +#endif + + ! The actual solve step +#if defined(TEST_EIGENVECTORS) +#if TEST_QR_DECOMPOSITION == 1 + call e%timer_start("e%eigenvectors_qr()") +#else + call e%timer_start("e%eigenvectors()") +#endif +#ifdef TEST_SCALAPACK_ALL + call solve_scalapack_all(na, a, sc_desc, ev, z) +#elif TEST_SCALAPACK_PART + call solve_scalapack_part(na, a, sc_desc, nev, ev, z) + check_all_evals = .false. ! 
scalapack does not compute all eigenvectors +#else + call e%eigenvectors(a, ev, z, error_elpa) +#endif +#if TEST_QR_DECOMPOSITION == 1 + call e%timer_stop("e%eigenvectors_qr()") +#else + call e%timer_stop("e%eigenvectors()") +#endif +#endif /* TEST_EIGENVECTORS */ + +#ifdef TEST_EIGENVALUES + call e%timer_start("e%eigenvalues()") + call e%eigenvalues(a, ev, error_elpa) + call e%timer_stop("e%eigenvalues()") +#endif + +#if defined(TEST_SOLVE_TRIDIAGONAL) + call e%timer_start("e%solve_tridiagonal()") + call e%solve_tridiagonal(d, sd, z, error_elpa) + call e%timer_stop("e%solve_tridiagonal()") + ev(:) = d(:) +#endif + +#if defined(TEST_CHOLESKY) + call e%timer_start("e%cholesky()") + call e%cholesky(a, error_elpa) + assert_elpa_ok(error_elpa) + call e%timer_stop("e%cholesky()") +#endif + +#if defined(TEST_HERMITIAN_MULTIPLY) + call e%timer_start("e%hermitian_multiply()") + call e%hermitian_multiply('F','F', int(na,kind=c_int), a, b, int(na_rows,kind=c_int), & + int(na_cols,kind=c_int), c, int(na_rows,kind=c_int), & + int(na_cols,kind=c_int), error_elpa) + call e%timer_stop("e%hermitian_multiply()") +#endif + +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + call e%timer_start("e%generalized_eigenvectors()") +#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM) + call e%timer_start("is_already_decomposed=.false.") +#endif + call e%generalized_eigenvectors(a, b, ev, z, .false., error_elpa) +#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM) + call e%timer_stop("is_already_decomposed=.false.") + a = as + call e%timer_start("is_already_decomposed=.true.") + call e%generalized_eigenvectors(a, b, ev, z, .true., error_elpa) + call e%timer_stop("is_already_decomposed=.true.") +#endif + call e%timer_stop("e%generalized_eigenvectors()") +#endif + + assert_elpa_ok(error_elpa) + +#ifdef TEST_ALL_KERNELS + call e%timer_stop(elpa_int_value_to_string(KERNEL_KEY, kernel)) +#endif + + if (myid .eq. 
0) then +#ifdef TEST_ALL_KERNELS + call e%print_times(elpa_int_value_to_string(KERNEL_KEY, kernel)) +#else /* TEST_ALL_KERNELS */ + +#if defined(TEST_EIGENVECTORS) +#if TEST_QR_DECOMPOSITION == 1 + call e%print_times("e%eigenvectors_qr()") +#else + call e%print_times("e%eigenvectors()") +#endif +#endif +#ifdef TEST_EIGENVALUES + call e%print_times("e%eigenvalues()") +#endif +#ifdef TEST_SOLVE_TRIDIAGONAL + call e%print_times("e%solve_tridiagonal()") +#endif +#ifdef TEST_CHOLESKY + call e%print_times("e%cholesky()") +#endif +#ifdef TEST_HERMITIAN_MULTIPLY + call e%print_times("e%hermitian_multiply()") +#endif +#ifdef TEST_GENERALIZED_EIGENPROBLEM + call e%print_times("e%generalized_eigenvectors()") +#endif +#endif /* TEST_ALL_KERNELS */ + endif + + if (do_test_analytic_eigenvalues) then + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, & + my_prow, my_pcol, check_all_evals, .false.) + call check_status(status, myid) + endif + + if (do_test_analytic_eigenvalues_eigenvectors) then + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, & + my_prow, my_pcol, check_all_evals, .true.) 
+ call check_status(status, myid) + endif + + if(do_test_numeric_residual) then + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, & + np_rows,np_cols, my_prow, my_pcol) + call check_status(status, myid) + endif + + if (do_test_frank_eigenvalues) then + status = check_correctness_eigenvalues_frank(na, ev, z, myid) + call check_status(status, myid) + endif + + if (do_test_toeplitz_eigenvalues) then +#if defined(TEST_EIGENVALUES) || defined(TEST_SOLVE_TRIDIAGONAL) + status = check_correctness_eigenvalues_toeplitz(na, diagonalElement, & + subdiagonalElement, ev, z, myid) + call check_status(status, myid) +#endif + endif + + if (do_test_cholesky) then + status = check_correctness_cholesky(na, a, as, na_rows, sc_desc, myid ) + call check_status(status, myid) + endif + +#ifdef TEST_HERMITIAN_MULTIPLY + if (do_test_hermitian_multiply) then + status = check_correctness_hermitian_multiply(na, a, b, c, na_rows, sc_desc, myid ) + call check_status(status, myid) + endif +#endif + +#ifdef TEST_GENERALIZED_EIGENPROBLEM + if(do_test_numeric_residual_generalized) then + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, & + np_cols, my_prow, & + my_pcol, bs) + call check_status(status, myid) + endif +#endif + + +#ifdef WITH_OPENMP + if (threads_caller .ne. omp_get_max_threads()) then + if (myid .eq. 0) then + print *, " ERROR! the number of OpenMP threads has not been restored correctly" + endif + status = 1 + endif +#endif + if (myid == 0) then + print *, "" + endif + +#ifdef TEST_ALL_KERNELS + a(:,:) = as(:,:) +#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK) + d = ds + sd = sds +#endif + end do ! 
kernels +#endif + + call elpa_deallocate(e, error_elpa) + assert_elpa_ok(error_elpa) + + deallocate(a) + deallocate(as) + deallocate(z) + deallocate(ev) +#ifdef TEST_HERMITIAN_MULTIPLY + deallocate(b) + deallocate(c) +#endif +#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK) + deallocate(d, ds) + deallocate(sd, sds) +#endif +#if defined(TEST_GENERALIZED_EIGENPROBLEM) + deallocate(b, bs) +#endif + +#ifdef TEST_ALL_LAYOUTS + end do ! factors + end do ! layouts +#endif + call elpa_uninit(error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + call exit(status) + + contains + + subroutine check_status(status, myid) + implicit none + TEST_INT_TYPE, intent(in) :: status, myid + TEST_INT_MPI_TYPE :: mpierr + if (status /= 0) then + if (myid == 0) print *, "Result incorrect!" +#ifdef WITH_MPI + call mpi_finalize(mpierr) +#endif + call exit(status) + endif + end subroutine + +end program diff -Nru elpa-2016.05.001/test/Fortran/test_multiple_objs.F90 elpa-2019.11.001/test/Fortran/test_multiple_objs.F90 --- elpa-2016.05.001/test/Fortran/test_multiple_objs.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/test_multiple_objs.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,379 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! 
More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +! Define one of TEST_REAL or TEST_COMPLEX +! Define one of TEST_SINGLE or TEST_DOUBLE +! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE +! Define TEST_GPU \in [0, 1] +! 
Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel] + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +error: define exactly one of TEST_REAL or TEST_COMPLEX +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +error: define exactly one of TEST_SINGLE or TEST_DOUBLE +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE real(kind=C_FLOAT) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_FLOAT) +# else +# define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX) +# endif +#else +# define EV_TYPE real(kind=C_DOUBLE) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_DOUBLE) +# else +# define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX) +# endif +#endif + + +#ifdef TEST_REAL +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL +#else +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX +#endif + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + + +#include "assert.h" + +program test + use elpa + + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + use test_analytic + use iso_fortran_env + +#ifdef HAVE_REDIRECT + use test_redirect +#endif + implicit none + + ! matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_TYPE :: ierr + TEST_INT_MPI_TYPE :: mpierr + ! 
blacs + character(len=1) :: layout + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + MATRIX_TYPE, allocatable :: a(:,:), as(:,:) + ! eigenvectors + MATRIX_TYPE, allocatable :: z(:,:) + ! eigenvalues + EV_TYPE, allocatable :: ev(:) + + TEST_INT_TYPE :: status + integer(kind=c_int) :: error_elpa + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e1, e2, e_ptr + class(elpa_autotune_t), pointer :: tune_state + + TEST_INT_TYPE :: iter + character(len=5) :: iter_string + TEST_INT_TYPE :: timings, debug, gpu + + call read_input_parameters(na, nev, nblk, write_to_file) + call setup_mpi(myid, nprocs) +#ifdef HAVE_REDIRECT +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + call redirect_stdout(myid) +#endif +#endif + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + layout = 'C' + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + np_rows = nprocs/np_cols + assert(nprocs == np_rows * np_cols) + + if (myid == 0) then + print '((a,i0))', 'Matrix size: ', na + print '((a,i0))', 'Num eigenvectors: ', nev + print '((a,i0))', 'Blocksize: ', nblk +#ifdef WITH_MPI + print '((a,i0))', 'Num MPI proc: ', nprocs + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print '(a)', 'Process layout: ' // layout +#endif + print *,'' + endif + + call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a (na_rows,na_cols)) + allocate(as(na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + allocate(ev(na)) + + a(:,:) = 0.0 + z(:,:) = 0.0 + ev(:) = 0.0 + + call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times=.false.) 
+ as(:,:) = a(:,:) + + e1 => elpa_allocate(error_elpa) + !assert_elpa_ok(error_elpa) + + call set_basic_params(e1, na, nev, na_rows, na_cols, my_prow, my_pcol) + + call e1%set("timings",1, error_elpa) + assert_elpa_ok(error_elpa) + + call e1%set("debug",1, error_elpa) + assert_elpa_ok(error_elpa) + + call e1%set("gpu", 0, error_elpa) + assert_elpa_ok(error_elpa) + !call e1%set("max_stored_rows", 15, error_elpa) + + assert_elpa_ok(e1%setup()) + + call e1%store_settings("initial_parameters.txt", error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + ! barrier after store settings, file created from one MPI rank only, but loaded everywhere + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + + ! try to load parameters into another object + e2 => elpa_allocate(error_elpa) + assert_elpa_ok(error_elpa) + + call set_basic_params(e2, na, nev, na_rows, na_cols, my_prow, my_pcol) + call e2%load_settings("initial_parameters.txt", error_elpa) + assert_elpa_ok(error_elpa) + + assert_elpa_ok(e2%setup()) + + ! test whether the user setting of e1 are correctly loade to e2 + call e2%get("timings", int(timings,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%get("debug", int(debug,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call e2%get("gpu", int(gpu,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + if ((timings .ne. 1) .or. (debug .ne. 1) .or. (gpu .ne. 0)) then + print *, "Parameters not stored or loaded correctly. 
Aborting...", timings, debug, gpu + stop 1 + endif + + if(myid == 0) print *, "parameters of e1" + call e1%print_settings(error_elpa) + assert_elpa_ok(error_elpa) + + if(myid == 0) print *, "" + if(myid == 0) print *, "parameters of e2" + call e2%print_settings(error_elpa) + assert_elpa_ok(error_elpa) + + e_ptr => e2 + + + tune_state => e_ptr%autotune_setup(ELPA_AUTOTUNE_FAST, AUTOTUNE_DOMAIN, error_elpa) + assert_elpa_ok(error_elpa) + + + iter=0 + do while (e_ptr%autotune_step(tune_state, error_elpa)) + assert_elpa_ok(error_elpa) + + iter=iter+1 + write(iter_string,'(I5.5)') iter + call e_ptr%print_settings(error_elpa) + assert_elpa_ok(error_elpa) + + call e_ptr%store_settings("saved_parameters_"//trim(iter_string)//".txt", error_elpa) + assert_elpa_ok(error_elpa) + + call e_ptr%timer_start("eigenvectors: iteration "//trim(iter_string)) + call e_ptr%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + call e_ptr%timer_stop("eigenvectors: iteration "//trim(iter_string)) + + assert_elpa_ok(error_elpa) + if (myid .eq. 0) then + print *, "" + call e_ptr%print_times("eigenvectors: iteration "//trim(iter_string)) + endif + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, & + .true., .true., print_times=.false.) + a(:,:) = as(:,:) + call e_ptr%autotune_print_state(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + + call e_ptr%autotune_save_state(tune_state, "saved_state_"//trim(iter_string)//".txt", error_elpa) + assert_elpa_ok(error_elpa) +#ifdef WITH_MPI + ! barrier after save state, file created from one MPI rank only, but loaded everywhere + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + call e_ptr%autotune_load_state(tune_state, "saved_state_"//trim(iter_string)//".txt", error_elpa) + assert_elpa_ok(error_elpa) + + end do + + ! set and print the autotuned-settings + call e_ptr%autotune_set_best(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + + if (myid .eq. 
0) then + print *, "The best combination found by the autotuning:" + flush(output_unit) + call e_ptr%autotune_print_best(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + endif + ! de-allocate autotune object + call elpa_autotune_deallocate(tune_state, error_elpa) + assert_elpa_ok(error_elpa) + + if (myid .eq. 0) then + print *, "Running once more time with the best found setting..." + endif + call e_ptr%timer_start("eigenvectors: best setting") + call e_ptr%eigenvectors(a, ev, z, error_elpa) + assert_elpa_ok(error_elpa) + + call e_ptr%timer_stop("eigenvectors: best setting") + assert_elpa_ok(error_elpa) + if (myid .eq. 0) then + print *, "" + call e_ptr%print_times("eigenvectors: best setting") + endif + status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, & + .true., .true., print_times=.false.) + + call elpa_deallocate(e_ptr, error_elpa) + !assert_elpa_ok(error_elpa) + + deallocate(a) + deallocate(as) + deallocate(z) + deallocate(ev) + + call elpa_uninit(error_elpa) + !assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + + call exit(status) + +contains + subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, my_prow, my_pcol) + implicit none + class(elpa_t), pointer :: elpa + TEST_INT_TYPE, intent(in) :: na, nev, na_rows, na_cols, my_prow, my_pcol + + call elpa%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call elpa%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call 
elpa%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + end subroutine + +end program diff -Nru elpa-2016.05.001/test/Fortran/test_skewsymmetric.F90 elpa-2019.11.001/test/Fortran/test_skewsymmetric.F90 --- elpa-2016.05.001/test/Fortran/test_skewsymmetric.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/test_skewsymmetric.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,400 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! 
ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +! Define one of TEST_REAL or TEST_COMPLEX +! Define one of TEST_SINGLE or TEST_DOUBLE +! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE +! Define TEST_GPU \in [0, 1] +! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel] + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +error: define exactly one of TEST_REAL or TEST_COMPLEX +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +error: define exactly one of TEST_SINGLE or TEST_DOUBLE +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE real(kind=C_FLOAT) +# define EV_TYPE_COMPLEX complex(kind=C_FLOAT_COMPLEX) +# define MATRIX_TYPE_COMPLEX complex(kind=C_FLOAT_COMPLEX) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_FLOAT) +# else +# define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX) +# endif +#else +# define MATRIX_TYPE_COMPLEX complex(kind=C_DOUBLE_COMPLEX) +# define EV_TYPE_COMPLEX complex(kind=C_DOUBLE_COMPLEX) +# define EV_TYPE real(kind=C_DOUBLE) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_DOUBLE) +# else +# define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX) +# endif +#endif + +#ifdef TEST_REAL +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL +#else +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX +#endif +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define 
INT_MPI_TYPE c_int32_t +#endif +#include "assert.h" + +program test + use elpa + + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + use precision_for_tests + use iso_fortran_env + +#ifdef HAVE_REDIRECT + use test_redirect +#endif + implicit none + + ! matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_MPI_TYPE :: mpierr + + ! blacs + character(len=1) :: layout + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + MATRIX_TYPE, allocatable :: a_skewsymmetric(:,:), as_skewsymmetric(:,:) + MATRIX_TYPE_COMPLEX, allocatable :: a_complex(:,:), as_complex(:,:) + ! eigenvectors + MATRIX_TYPE, allocatable :: z_skewsymmetric(:,:) + MATRIX_TYPE_COMPLEX, allocatable :: z_complex(:,:) + ! eigenvalues + EV_TYPE, allocatable :: ev_skewsymmetric(:), ev_complex(:) + + TEST_INT_TYPE :: status, i, j + integer(kind=c_int) :: error_elpa + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e_complex, e_skewsymmetric + + call read_input_parameters(na, nev, nblk, write_to_file) + call setup_mpi(myid, nprocs) +#ifdef HAVE_REDIRECT +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + call redirect_stdout(myid) +#endif +#endif + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif +! 
+ layout = 'C' + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + np_rows = nprocs/np_cols + assert(nprocs == np_rows * np_cols) + + if (myid == 0) then + print '((a,i0))', 'Matrix size: ', na + print '((a,i0))', 'Num eigenvectors: ', nev + print '((a,i0))', 'Blocksize: ', nblk +#ifdef WITH_MPI + print '((a,i0))', 'Num MPI proc: ', nprocs + print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs + print '(a)', 'Process layout: ' // layout +#endif + print *,'' + endif + + call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, & + np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a_skewsymmetric (na_rows,na_cols)) + allocate(as_skewsymmetric(na_rows,na_cols)) + allocate(z_skewsymmetric (na_rows,2*na_cols)) + allocate(ev_skewsymmetric(na)) + + a_skewsymmetric(:,:) = 0.0 + z_skewsymmetric(:,:) = 0.0 + ev_skewsymmetric(:) = 0.0 + + call prepare_matrix_random(na, myid, sc_desc, a_skewsymmetric, & + z_skewsymmetric(:,1:na_cols), as_skewsymmetric, is_skewsymmetric=1) + + !call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + as_skewsymmetric(:,:) = a_skewsymmetric(:,:) + + + ! prepare the complex matrix for the "brute force" case + allocate(a_complex (na_rows,na_cols)) + allocate(as_complex(na_rows,na_cols)) + allocate(z_complex (na_rows,na_cols)) + allocate(ev_complex(na)) + + a_complex(1:na_rows,1:na_cols) = 0.0 + z_complex(1:na_rows,1:na_cols) = 0.0 + as_complex(1:na_rows,1:na_cols) = 0.0 + + + do j=1, na_cols + do i=1,na_rows +#ifdef TEST_DOUBLE + a_complex(i,j) = dcmplx(0.0, a_skewsymmetric(i,j)) +#endif +#ifdef TEST_SINGLE + a_complex(i,j) = cmplx(0.0, a_skewsymmetric(i,j)) +#endif + enddo + enddo + + + + z_complex(1:na_rows,1:na_cols) = a_complex(1:na_rows,1:na_cols) + as_complex(1:na_rows,1:na_cols) = a_complex(1:na_rows,1:na_cols) + + ! 
first set up and solve the brute force problem + e_complex => elpa_allocate() + call set_basic_params(e_complex, na, nev, na_rows, na_cols, my_prow, my_pcol) + + call e_complex%set("timings",1, error_elpa) + + call e_complex%set("debug",1) + call e_complex%set("gpu", 0) + call e_complex%set("omp_threads", 8, error_elpa) + + assert_elpa_ok(e_complex%setup()) + call e_complex%set("solver", elpa_solver_2stage, error_elpa) + + call e_complex%timer_start("eigenvectors: brute force as complex matrix") + call e_complex%eigenvectors(a_complex, ev_complex, z_complex, error_elpa) + call e_complex%timer_stop("eigenvectors: brute force as complex matrix") + + if (myid .eq. 0) then + print *, "" + call e_complex%print_times("eigenvectors: brute force as complex matrix") + endif +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif +! as_complex(:,:) = z_complex(:,:) +#ifdef TEST_SINGLE + status = check_correctness_evp_numeric_residuals_complex_single(na, nev, as_complex, z_complex, ev_complex, sc_desc, & + nblk, myid, np_rows,np_cols, my_prow, my_pcol) +#else + status = check_correctness_evp_numeric_residuals_complex_double(na, nev, as_complex, z_complex, ev_complex, sc_desc, & + nblk, myid, np_rows,np_cols, my_prow, my_pcol) +#endif + status = 0 + call check_status(status, myid) + +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + ! 
now run the skewsymmetric case + e_skewsymmetric => elpa_allocate() + call set_basic_params(e_skewsymmetric, na, nev, na_rows, na_cols, my_prow, my_pcol) + + call e_skewsymmetric%set("timings",1, error_elpa) + + call e_skewsymmetric%set("debug",1) + call e_skewsymmetric%set("gpu", 0) + call e_skewsymmetric%set("omp_threads",8, error_elpa) + + assert_elpa_ok(e_skewsymmetric%setup()) + + call e_skewsymmetric%set("solver", elpa_solver_2stage, error_elpa) + + call e_skewsymmetric%timer_start("eigenvectors: skewsymmetric ") + call e_skewsymmetric%skew_eigenvectors(a_skewsymmetric, ev_skewsymmetric, z_skewsymmetric, error_elpa) + call e_skewsymmetric%timer_stop("eigenvectors: skewsymmetric ") + + if (myid .eq. 0) then + print *, "" + call e_skewsymmetric%print_times("eigenvectors: skewsymmetric") + endif + + ! check eigenvalues + do i=1, na + if (myid == 0) then +#ifdef TEST_DOUBLE + if (abs(ev_complex(i)-ev_skewsymmetric(i))/abs(ev_complex(i)) .gt. 1e-10) then +#endif +#ifdef TEST_SINGLE + if (abs(ev_complex(i)-ev_skewsymmetric(i))/abs(ev_complex(i)) .gt. 1e-4) then +#endif + print *,"ev: i=",i,ev_complex(i),ev_skewsymmetric(i) + status = 1 + endif + endif + enddo + + +! 
call check_status(status, myid) + + z_complex(:,:) = 0 + do j=1, na_cols + do i=1,na_rows +#ifdef TEST_DOUBLE + z_complex(i,j) = dcmplx(z_skewsymmetric(i,j), z_skewsymmetric(i,na_cols+j)) +#endif +#ifdef TEST_SINGLE + z_complex(i,j) = cmplx(z_skewsymmetric(i,j), z_skewsymmetric(i,na_cols+j)) +#endif + enddo + enddo +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + +#ifdef TEST_SINGLE + status = check_correctness_evp_numeric_residuals_ss_real_single(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, & + sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol) +#else + status = check_correctness_evp_numeric_residuals_ss_real_double(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, & + sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol) +#endif + +#ifdef WITH_MPI + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) +#endif + call elpa_deallocate(e_complex) + call elpa_deallocate(e_skewsymmetric) + + + !to do + ! - check whether brute-force check_correctness_evp_numeric_residuals worsk (complex ev) + ! 
- invent a test for skewsymmetric residuals + + deallocate(a_complex) + deallocate(as_complex) + deallocate(z_complex) + deallocate(ev_complex) + + deallocate(a_skewsymmetric) + deallocate(as_skewsymmetric) + deallocate(z_skewsymmetric) + deallocate(ev_skewsymmetric) + call elpa_uninit() + + + +#ifdef WITH_MPI + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) +#endif + + call exit(status) + +contains + subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, my_prow, my_pcol) + implicit none + class(elpa_t), pointer :: elpa + TEST_INT_TYPE, intent(in) :: na, nev, na_rows, na_cols, my_prow, my_pcol + + call elpa%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + +#ifdef WITH_MPI + call elpa%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + end subroutine + subroutine check_status(status, myid) + implicit none + TEST_INT_TYPE, intent(in) :: status, myid + TEST_INT_MPI_TYPE :: mpierr + if (status /= 0) then + if (myid == 0) print *, "Result incorrect!" 
+#ifdef WITH_MPI + call mpi_finalize(mpierr) +#endif + call exit(status) + endif + end subroutine +end program diff -Nru elpa-2016.05.001/test/Fortran/test_split_comm.F90 elpa-2019.11.001/test/Fortran/test_split_comm.F90 --- elpa-2016.05.001/test/Fortran/test_split_comm.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/Fortran/test_split_comm.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,340 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! 
may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + +! Define one of TEST_REAL or TEST_COMPLEX +! Define one of TEST_SINGLE or TEST_DOUBLE +! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE +! Define TEST_GPU \in [0, 1] +! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel] + +#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX)) +error: define exactly one of TEST_REAL or TEST_COMPLEX +#endif + +#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE)) +error: define exactly one of TEST_SINGLE or TEST_DOUBLE +#endif + +#ifdef TEST_SINGLE +# define EV_TYPE real(kind=C_FLOAT) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_FLOAT) +# else +# define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX) +# endif +#else +# define EV_TYPE real(kind=C_DOUBLE) +# ifdef TEST_REAL +# define MATRIX_TYPE real(kind=C_DOUBLE) +# else +# define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX) +# endif +#endif + + +#ifdef TEST_REAL +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL +#else +# define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX +#endif + + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif +#include "assert.h" + +program test + use elpa + + !use test_util + use test_setup_mpi + use test_prepare_matrix + use test_read_input_parameters + use test_blacs_infrastructure + use test_check_correctness + use test_analytic + use iso_fortran_env + +#ifdef HAVE_REDIRECT + use test_redirect +#endif + implicit none + + ! 
matrix dimensions + TEST_INT_TYPE :: na, nev, nblk + TEST_INT_TYPE :: num_groups, group_size, color, key + + ! mpi + TEST_INT_TYPE :: myid, nprocs + TEST_INT_TYPE :: na_cols, na_rows ! local matrix size + TEST_INT_TYPE :: np_cols, np_rows ! number of MPI processes per column/row + TEST_INT_TYPE :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1) + TEST_INT_MPI_TYPE :: mpierr, ierr,mpi_sub_commMPI, myidMPI, nprocsMPI, colorMPI, keyMPI, & + myid_subMPI, nprocs_subMPI + TEST_INT_TYPE :: mpi_sub_comm + TEST_INT_TYPE :: myid_sub, nprocs_sub + + ! blacs + character(len=1) :: layout + TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + ! The Matrix + MATRIX_TYPE, allocatable :: a(:,:), as(:,:) + ! eigenvectors + MATRIX_TYPE, allocatable :: z(:,:) + ! eigenvalues + EV_TYPE, allocatable :: ev(:) + + TEST_INT_TYPE :: status + integer(kind=c_int) :: error_elpa + + type(output_t) :: write_to_file + class(elpa_t), pointer :: e + + TEST_INT_TYPE :: iter + character(len=5) :: iter_string + + status = 0 +#ifdef WITH_MPI + + call read_input_parameters(na, nev, nblk, write_to_file) + !call setup_mpi(myid, nprocs) + call mpi_init(mpierr) + call mpi_comm_rank(mpi_comm_world, myidMPI,mpierr) + call mpi_comm_size(mpi_comm_world, nprocsMPI,mpierr) + myid = int(myidMPI,kind=BLAS_KIND) + nprocs = int(nprocsMPI,kind=BLAS_KIND) + + if((mod(nprocs, 4) == 0) .and. (nprocs > 4)) then + num_groups = 4 + else if(mod(nprocs, 3) == 0) then + num_groups = 3 + else if(mod(nprocs, 2) == 0) then + num_groups = 2 + else + num_groups = 1 + endif + + group_size = nprocs / num_groups + + if(num_groups * group_size .ne. nprocs) then + print *, "Something went wrong before splitting the communicator" + stop 1 + else + if(myid == 0) then + print '((a,i0,a,i0))', "The test will split the global communicator into ", num_groups, " groups of size ", group_size + endif + endif + + ! 
each group of processors will have the same color + color = mod(myid, num_groups) + ! this will determine the myid in each group + key = myid/num_groups + !split the communicator + colorMPI=int(color,kind=MPI_KIND) + keyMPI = int(key, kind=MPI_KIND) + call mpi_comm_split(mpi_comm_world, colorMPI, keyMPI, mpi_sub_commMPI, mpierr) + mpi_sub_comm = int(mpi_sub_commMPI,kind=BLAS_KIND) + color = int(colorMPI,kind=BLAS_KIND) + key = int(keyMPI,kind=BLAS_KIND) + if(mpierr .ne. MPI_SUCCESS) then + print *, "communicator splitting not successfull", mpierr + stop 1 + endif + + call mpi_comm_rank(mpi_sub_commMPI, myid_subMPI, mpierr) + call mpi_comm_size(mpi_sub_commMPI, nprocs_subMPI, mpierr) + myid_sub = int(myid_subMPI,kind=BLAS_KIND) + nprocs_sub = int(nprocs_subMPI,kind=BLAS_KIND) + + !print *, "glob ", myid, nprocs, ", loc ", myid_sub, nprocs_sub, ", color ", color, ", key ", key + + if((mpierr .ne. MPI_SUCCESS) .or. (nprocs_sub .ne. group_size) .or. (myid_sub >= group_size)) then + print *, "something wrong with the sub communicators" + stop 1 + endif + + +#ifdef HAVE_REDIRECT + call MPI_BARRIER(MPI_COMM_WORLD, mpierr) + call redirect_stdout(myid) +#endif + + if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop 1 + endif + + layout = 'C' + do np_cols = NINT(SQRT(REAL(nprocs_sub))),2,-1 + if(mod(nprocs_sub,np_cols) == 0 ) exit + enddo + np_rows = nprocs_sub/np_cols + assert(nprocs_sub == np_rows * np_cols) + assert(nprocs == np_rows * np_cols * num_groups) + + if (myid == 0) then + print '((a,i0))', 'Matrix size: ', na + print '((a,i0))', 'Num eigenvectors: ', nev + print '((a,i0))', 'Blocksize: ', nblk + print '(a)', 'Process layout: ' // layout + print *,'' + endif + if (myid_sub == 0) then + print '(4(a,i0))','GROUP ', color, ': Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs_sub + endif + + ! 
USING the subcommunicator + call set_up_blacsgrid(int(mpi_sub_comm,kind=BLAS_KIND), np_rows, np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & + na_rows, na_cols, sc_desc, my_blacs_ctxt, info) + + allocate(a (na_rows,na_cols)) + allocate(as(na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + allocate(ev(na)) + + a(:,:) = 0.0 + z(:,:) = 0.0 + ev(:) = 0.0 + + !call prepare_matrix_analytic(na, a, nblk, myid_sub, np_rows, np_cols, my_prow, my_pcol, print_times=.false.) + call prepare_matrix_random(na, myid_sub, sc_desc, a, z, as) + as(:,:) = a(:,:) + + e => elpa_allocate(error_elpa) + call set_basic_params(e, na, nev, na_rows, na_cols, mpi_sub_comm, my_prow, my_pcol) + + call e%set("timings",1, error_elpa) + + call e%set("debug",1, error_elpa) + call e%set("gpu", 0, error_elpa) + !call e%set("max_stored_rows", 15, error_elpa) + + assert_elpa_ok(e%setup()) + + + +! if(myid == 0) print *, "parameters of e" +! call e%print_all_parameters() +! if(myid == 0) print *, "" + + + call e%timer_start("eigenvectors") + call e%eigenvectors(a, ev, z, error_elpa) + call e%timer_stop("eigenvectors") + + assert_elpa_ok(error_elpa) + + !status = check_correctness_analytic(na, nev, ev, z, nblk, myid_sub, np_rows, np_cols, my_prow, my_pcol, & + ! .true., .true., print_times=.false.) + status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid_sub, & + np_rows,np_cols, my_prow, my_pcol) + if (status /= 0) & + print *, "processor ", myid, ": Result incorrect for processor group ", color + + if (myid .eq. 
0) then + print *, "Showing times of one goup only" + call e%print_times("eigenvectors") + endif + + call elpa_deallocate(e, error_elpa) + + deallocate(a) + deallocate(as) + deallocate(z) + deallocate(ev) + + call elpa_uninit(error_elpa) + + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) + +#endif + call exit(status) + +contains + subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, communicator, my_prow, my_pcol) + use iso_c_binding + implicit none + class(elpa_t), pointer :: elpa + TEST_INT_TYPE, intent(in) :: na, nev, na_rows, na_cols, my_prow, my_pcol, communicator + +#ifdef WITH_MPI + call elpa%set("na", int(na,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nev", int(nev,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("nblk", int(nblk,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + + call elpa%set("mpi_comm_parent", int(communicator,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("process_row", int(my_prow,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) + call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa) + assert_elpa_ok(error_elpa) +#endif + end subroutine + +end program diff -Nru elpa-2016.05.001/test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa-2019.11.001/test/fortran_test_programs/elpa_test_programs_print_headers.X90 --- elpa-2016.05.001/test/fortran_test_programs/elpa_test_programs_print_headers.X90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/fortran_test_programs/elpa_test_programs_print_headers.X90 1970-01-01 00:00:00.000000000 +0000 @@ -1,194 +0,0 @@ -#if 0 -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! 
-! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -#endif - -#ifdef WITH_OPENMP - if (myid .eq. 
0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifndef WITH_MPI - if (myid .eq. 0) then - print *,"This version of ELPA does not support MPI parallelisation" - print *,"For MPI support re-build ELPA with appropiate flags" - print *," " - endif -#endif - -#ifdef WITH_MPI - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) -#endif -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif -#ifdef WITH_MPI - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) -#endif - call redirect_stdout(myid) - endif -#endif - -#ifndef ELPA1 - - if (myid .eq. 0) then - print *," " - print *,"This ELPA2 is build with" -#if DATATYPE == REAL - -#ifdef HAVE_AVX2 - -#ifdef WITH_REAL_AVX_BLOCK2_KERNEL - print *,"AVX2 optimized kernel (2 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK4_KERNEL - print *,"AVX2 optimized kernel (4 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK6_KERNEL - print *,"AVX2 optimized kernel (6 blocking) for real matrices" -#endif - -#else /* no HAVE_AVX2 */ - -#ifdef HAVE_AVX - -#ifdef WITH_REAL_AVX_BLOCK2_KERNEL - print *,"AVX optimized kernel (2 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK4_KERNEL - print *,"AVX optimized kernel (4 blocking) for real matrices" -#endif -#ifdef WITH_REAL_AVX_BLOCK6_KERNEL - print *,"AVX optimized kernel (6 blocking) for real matrices" -#endif - -#endif - -#endif /* HAVE_AVX2 */ - - -#ifdef WITH_REAL_GENERIC_KERNEL - print *,"GENERIC kernel for real matrices" -#endif -#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL - print *,"GENERIC SIMPLE kernel for real matrices" -#endif -#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL - print *,"SSE ASSEMBLER kernel for real matrices" -#endif 
-#ifdef WITH_REAL_BGP_KERNEL - print *,"BGP kernel for real matrices" -#endif -#ifdef WITH_REAL_BGQ_KERNEL - print *,"BGQ kernel for real matrices" -#endif - -#endif /* DATATYPE == REAL */ - -#if DATATYPE == COMPLEX - -#ifdef HAVE_AVX2 - -#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL - print *,"AVX2 optimized kernel (2 blocking) for complex matrices" -#endif -#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL - print *,"AVX2 optimized kernel (1 blocking) for complex matrices" -#endif - -#else /* no HAVE_AVX2 */ - -#ifdef HAVE_AVX - -#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL - print *,"AVX optimized kernel (2 blocking) for complex matrices" -#endif -#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL - print *,"AVX optimized kernel (1 blocking) for complex matrices" -#endif - -#endif - -#endif /* HAVE_AVX2 */ - - -#ifdef WITH_COMPLEX_GENERIC_KERNEL - print *,"GENERIC kernel for complex matrices" -#endif -#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL - print *,"GENERIC SIMPLE kernel for complex matrices" -#endif -#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL - print *,"SSE ASSEMBLER kernel for complex matrices" -#endif - -#endif /* DATATYPE == COMPLEX */ - - endif -#endif /* ELPA1 */ - - if (write_to_file%eigenvectors) then - if (myid .eq. 0) print *,"Writing Eigenvectors to files" - endif - - if (write_to_file%eigenvalues) then - if (myid .eq. 0) print *,"Writing Eigenvalues to files" - endif - - diff -Nru elpa-2016.05.001/test/fortran_test_programs/read_real.F90 elpa-2019.11.001/test/fortran_test_programs/read_real.F90 --- elpa-2016.05.001/test/fortran_test_programs/read_real.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/fortran_test_programs/read_real.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,432 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! 
- Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> This program can read a matrix from an ascii -!> file and computes then the Eigenvectors. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -program read_real - -!------------------------------------------------------------------------------- -! 
Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -!------------------------------------------------------------------------------- - - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - implicit none - include 'mpif.h' - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik), parameter :: nblk = 16 - - !------------------------------------------------------------------------------- - ! Local Variables - - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol, lenarg - - integer, external :: numroc - - real(kind=rk) :: err, errmax - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - character*256 :: filename -#ifdef WITH_OPENMP - integer(kind=iK) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - !------------------------------------------------------------------------------- - ! MPI Initialization - -#ifndef WITH_OPENMP - call mpi_init(mpierr) -#else - required_mpi_thread_level = MPI_THREAD_MULTIPLE - - call mpi_init_thread(required_mpi_thread_level, & - provided_mpi_thread_level, mpierr) - - if (required_mpi_thread_level .ne. 
provided_mpi_thread_level) then - write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system" - write(error_unit,*) " only ", mpi_thread_level_name(provided_mpi_thread_level), " is available" - call exit(77) - endif - -#endif - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - call mpi_comm_size(mpi_comm_world,nprocs,mpierr) - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) - call redirect_stdout(myid) - endif -#endif - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Get the name of the input file containing the matrix and open input file - ! Please note: - ! get_command_argument is a FORTRAN 2003 intrinsic which may not be implemented - ! for every Fortran compiler!!! 
- - if(myid==0) then - call get_command_argument(1,filename,lenarg,info) - if(info/=0) then - write(error_unit,*) 'Usage: test_real matrix_file' - call mpi_abort(mpi_comm_world,1,mpierr) - endif - open(10,file=filename,action='READ',status='OLD',iostat=info) - if(info/=0) then - write(error_unit,*) 'Error: Unable to open ',trim(filename) - call mpi_abort(mpi_comm_world,1,mpierr) - endif - endif - call mpi_barrier(mpi_comm_world, mpierr) ! Just for safety - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow10000000) then - if(myid==0) write(error_unit,*) 'Illegal value for matrix size: ',na - call mpi_finalize(mpierr) - stop - endif - if(myid==0) print *,'Matrix size: ',na - - ! Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. - - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) - - ! 
Set up a scalapack descriptor for the checks below. - ! For ELPA the following restrictions hold: - ! - block sizes in both directions must be identical (args 4+5) - ! - first row and column of the distributed matrix must be on row/col 0/0 (args 6+7) - - call descinit( sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info ) - - !------------------------------------------------------------------------------- - ! Allocate matrices -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("set up matrix") -#endif - allocate(a (na_rows,na_cols)) - allocate(z (na_rows,na_cols)) - allocate(as(na_rows,na_cols)) - - allocate(ev(na)) - - !------------------------------------------------------------------------------- - ! Read matrix - - call read_matrix(10, na, a, ubound(a,1), nblk, my_prow, my_pcol, np_rows, np_cols) - if(myid==0) close(10) - - nev = na ! all eigenvaules - - ! Save original matrix A for later accuracy checks - - as = a -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("set up matrix") -#endif - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors - - call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only - call solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - mpi_comm_rows, mpi_comm_cols) - - if(myid == 0) print *,'Time tridiag_real :',time_evp_fwd - if(myid == 0) print *,'Time solve_tridi :',time_evp_solve - if(myid == 0) print *,'Time trans_ev_real:',time_evp_back - - if(myid == 0) then - do i=1,nev - print '(i6,g25.15)',i,ev(i) - enddo - endif - - !------------------------------------------------------------------------------- - ! Test correctness of result (using plain scalapack routines) - - deallocate(a) - allocate(tmp1(na_rows,na_cols)) - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - - ! 
tmp1 = A * Z - call pdgemm('N','N',na,nev,na,1.d0,as,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - - deallocate(as) - allocate(tmp2(na_rows,na_cols)) - - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev - call pdscal(na,ev(i),tmp2,1,i,sc_desc,1) - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0 - do i=1,nev - err = 0 - call pdnrm2(na,err,tmp1,1,i,sc_desc,1) - errmax = max(errmax, err) - enddo - - ! Get maximum error norm over all processors - err = errmax - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if(myid==0) print * - if(myid==0) print *,'Error Residual :',errmax - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 - call pdgemm('T','N',nev,nev,na,1.d0,z,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) - ! Initialize tmp2 to unit matrix - tmp2 = 0 - call pdlaset('A',nev,nev,0.d0,1.d0,tmp2,1,1,sc_desc) - - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! 
Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) - if(myid==0) print *,'Error Orthogonality:',errmax - - deallocate(z) - deallocate(tmp1) - deallocate(tmp2) - deallocate(ev) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("program") - print *," " - print *,"Timings program:" - call timer%print("program") - print *," " - print *,"End timings program" -#endif - call blacs_gridexit(my_blacs_ctxt) - call mpi_finalize(mpierr) - -end - -!------------------------------------------------------------------------------- -subroutine read_matrix(iunit, na, a, lda, nblk, my_prow, my_pcol, np_rows, np_cols) - - implicit none - include 'mpif.h' - - integer, intent(in) :: iunit, na, lda, nblk, my_prow, my_pcol, np_rows, np_cols - real*8, intent(out) :: a(lda, *) - - integer i, j, lr, lc, myid, mpierr - integer, allocatable :: l_row(:), l_col(:) - - real*8, allocatable :: col(:) - - ! allocate and set index arrays - - allocate(l_row(na)) - allocate(l_col(na)) - - ! Mapping of global rows/cols to local - - l_row(:) = 0 - l_col(:) = 0 - - lr = 0 ! local row counter - lc = 0 ! local column counter - - do i = 1, na - - if( MOD((i-1)/nblk,np_rows) == my_prow) then - ! row i is on local processor - lr = lr+1 - l_row(i) = lr - endif - - if( MOD((i-1)/nblk,np_cols) == my_pcol) then - ! 
column i is on local processor - lc = lc+1 - l_col(i) = lc - endif - - enddo - - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - allocate(col(na)) - - do i=1,na - if(myid==0) read(iunit,*) col(1:i) - call mpi_bcast(col,i,MPI_REAL8,0,MPI_COMM_WORLD,mpierr) - if(l_col(i) > 0) then - do j=1,i - if(l_row(j)>0) a(l_row(j),l_col(i)) = col(j) - enddo - endif - if(l_row(i) > 0) then - do j=1,i-1 - if(l_col(j)>0) a(l_row(i),l_col(j)) = col(j) - enddo - endif - enddo - - deallocate(l_row, l_col, col) - -end subroutine read_matrix diff -Nru elpa-2016.05.001/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 elpa-2019.11.001/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 --- elpa-2016.05.001/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,412 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! 
ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set in this program via -!> the API call. However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! 
Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -!------------------------------------------------------------------------------- - - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! 
Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - STATUS = 0 - -#define DATATYPE COMPLEX -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how to determine the ELPA2 kernel with" - print *, "an api call. Note, however, that setting the kernel via" - print *, "an environment variable will always take precedence over" - print *, "everything else! " - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The settings are: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE" - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. 
-!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - - use elpa_utilities, only : error_unit - use elpa2_utilities - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! 
na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE COMPLEX -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) 
- - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_complex_kernel_name())," as complex kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - - - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! 
ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "COMPLEX_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_complex2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE COMPLEX -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! 
ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 complex case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_complex - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - COMPLEX version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - real(kind=rk), allocatable :: ev(:), xr(:,:) - - complex(kind=ck), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - ! read input parameters if they are provided - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE COMPLEX -#define ELPA1 -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - COMPLEX version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The complex ELPA 2 kernel is set in this program via -!> the API call. However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE REAL -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! 
initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how to determine the ELPA2 kernel with" - print *, "an api call. Note, however, that setting the kernel via" - print *, "an environment variable will always take precedence over" - print *, "everything else! " - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! 
To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The settings are: REAL_ELPA_KERNEL_GENERIC_SIMPLE" - print * - - - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! 
Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! 
Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE REAL -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! 
at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_real_kernel_name())," as real kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. 
-!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> In this test case the qr_decomposition is used. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit - use elpa2_utilities - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef WITH_OPENMP - use test_util -#endif - -#ifdef HAVE_REDIRECT - use redirect -#endif - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! 
Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - !write_to_file = .false. - call read_input_parameters(na, nev, nblk, write_to_file) - - !if (COMMAND_ARGUMENT_COUNT() /= 0) then - ! write(error_unit,*) "This program does not support any command-line arguments" - ! stop 1 - !endif - - ! override nblk - ! nblk = 2 - ! na = 4000 - ! nev = 1500 - - ! make sure na, nbl is even - if (mod(nblk,2 ) .ne. 0) then - nblk = nblk - 1 - endif - - - ! make sure na is even - if (mod(na,2) .ne. 0) then - na = na - 1 - endif - ! make sure na is at least 34 - if (na .lt. 34) then - na = 34 - endif - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE REAL -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) 
- call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - print *, "This is an example how ELPA2 chooses a default kernel," -#ifdef HAVE_ENVIRONMENT_CHECKING - print *, "or takes the kernel defined in the environment variable," -#endif - print *, "since the ELPA API call does not contain any kernel specification" - print * - print *, " The settings are: ",trim(get_actual_real_kernel_name())," as real kernel" - print * -#ifndef HAVE_ENVIRONMENT_CHECKING - print *, " Notice that it is not possible with this build to set the " - print *, " kernel via an environment variable! To change this re-install" - print *, " the library and have a look at the log files" -#endif - print *, " The qr-decomposition is used via the api call" - endif - - !------------------------------------------------------------------------------- - ! 
Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 2 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -!> The real ELPA 2 kernel is set as the default kernel. -!> However, this can be overriden by setting -!> the environment variable "REAL_ELPA_KERNEL" to an -!> appropiate value. -!> -program test_real2 - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! 
with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use ELPA2 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE REAL -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! 
ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_real - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! 
-!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - - use elpa_mpi -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol - - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, & - provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - logical :: success - character(len=8) :: task_suffix - integer(kind=ik) :: j - !------------------------------------------------------------------------------- - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! 
MPI Initialization - call setup_mpi(myid, nprocs) - - STATUS = 0 - -#define DATATYPE REAL -#define ELPA1 -#include "elpa_test_programs_print_headers.X90" - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) - - - call timer%enable() - - call timer%start("program") -#endif - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! 
the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -!> -!> Fortran test programm to demonstrates the use of -!> ELPA 1 real case library. -!> If "HAVE_REDIRECT" was defined at build time -!> the stdout and stderr output of each MPI task -!> can be redirected to files if the environment -!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set -!> to "true". -!> -!> By calling executable [arg1] [arg2] [arg3] [arg4] -!> one can define the size (arg1), the number of -!> Eigenvectors to compute (arg2), and the blocking (arg3). -!> If these values are not set default values (4000, 1500, 16) -!> are choosen. -!> If these values are set the 4th argument can be -!> "output", which specifies that the EV's are written to -!> an ascii file. -!> -program test_real - -!------------------------------------------------------------------------------- -! Standard eigenvalue problem - REAL version -! -! This program demonstrates the use of the ELPA module -! together with standard scalapack routines -! -! Copyright of the original code rests with the authors inside the ELPA -! consortium. The copyright of any additional modifications shall rest -! with their original authors, but shall adhere to the licensing terms -! distributed along with the original code in the file "COPYING". -! -!------------------------------------------------------------------------------- - use precision - use ELPA1 - use elpa_utilities, only : error_unit - use from_c -#ifdef WITH_OPENMP - use test_util -#endif - - use mod_read_input_parameters - use mod_check_correctness - use mod_setup_mpi - use mod_blacs_infrastructure - use mod_prepare_matrix - use elpa_mpi -#ifdef HAVE_REDIRECT - use redirect -#endif -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use output_types - implicit none - - !------------------------------------------------------------------------------- - ! Please set system size parameters below! - ! na: System size - ! 
nev: Number of eigenvectors to be calculated - ! nblk: Blocking factor in block cyclic distribution - !------------------------------------------------------------------------------- - integer(kind=ik) :: nblk - integer(kind=ik) :: na, nev - - integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols - - integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols - integer(kind=ik) :: mpi_comm_rows_fromC, mpi_comm_cols_fromC - integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol,j - - integer(kind=ik) :: my_prowFromC, my_pcolFromC - integer, external :: numroc - - real(kind=rk), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:) - - real(kind=rk), allocatable :: aFromC(:,:), evFromC(:), zFromC(:,:) - - integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator - - integer(kind=ik) :: STATUS -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, & - provided_mpi_thread_level -#endif - type(output_t) :: write_to_file - - integer(kind=ik) :: checksWrong, checksWrongRecv - logical :: success - - success = .true. - - call read_input_parameters(na, nev, nblk, write_to_file) - - !------------------------------------------------------------------------------- - ! MPI Initialization - call setup_mpi(myid, nprocs) - -#ifdef HAVE_DETAILED_TIMINGS - - ! initialise the timing functionality - -#ifdef HAVE_LIBPAPI - call timer%measure_flops(.true.) -#endif - - call timer%measure_allocated_memory(.true.) - call timer%measure_virtual_memory(.true.) - call timer%measure_max_allocated_memory(.true.) - - call timer%set_print_options(& -#ifdef HAVE_LIBPAPI - print_flop_count=.true., & - print_flop_rate=.true., & -#endif - print_allocated_memory = .true. , & - print_virtual_memory=.true., & - print_max_allocated_memory=.true.) 
- - - call timer%enable() - - call timer%start("program") -#endif - !------------------------------------------------------------------------------- - ! Selection of number of processor rows/columns - ! We try to set up the grid square-like, i.e. start the search for possible - ! divisors of nprocs with a number next to the square root of nprocs - ! and decrement it until a divisor is found. - - - STATUS = 0 -#ifdef WITH_OPENMP - if (myid .eq. 0) then - print *,"Threaded version of test program" - print *,"Using ",omp_get_max_threads()," threads" - print *," " - endif -#endif -#ifndef WITH_MPI - if (myid .eq. 0) then - print *,"This version of ELPA does not support MPI parallelisation" - print *,"For MPI support re-build ELPA with appropiate flags" - print *," " - endif -#endif - -#ifdef WITH_MPI - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) -#endif - - -#ifdef HAVE_REDIRECT - if (check_redirect_environment_variable()) then - if (myid .eq. 0) then - print *," " - print *,"Redirection of mpi processes is used" - print *," " - if (create_directories() .ne. 1) then - write(error_unit,*) "Unable to create directory for stdout and stderr!" - stop - endif - endif -#ifdef WITH_MPI - call MPI_BARRIER(MPI_COMM_WORLD, mpierr) -#endif - call redirect_stdout(myid) - endif -#endif - - do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 - if(mod(nprocs,np_cols) == 0 ) exit - enddo - ! at the end of the above loop, nprocs is always divisible by np_cols - - np_rows = nprocs/np_cols - - if(myid==0) then - print * - print '(a)','Standard eigenvalue problem - REAL version' - print * - print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk - print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs - print * - endif - - !------------------------------------------------------------------------------- - ! Set up BLACS context and MPI communicators - ! - ! The BLACS context is only necessary for using Scalapack. - ! - ! 
For ELPA, the MPI communicators along rows/cols are sufficient, - ! and the grid setup may be done in an arbitrary way as long as it is - ! consistent (i.e. 0<=my_prow +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + +#include "config-f90.h" +#define PRECISION_MODULE precision_for_tests +module tests_blas_interfaces + use iso_c_binding + use precision_for_tests + + implicit none + +#include "../../src/helpers/fortran_blas_interfaces.F90" + +end module diff -Nru elpa-2016.05.001/test/shared/mod_tests_scalapack_interfaces.F90 elpa-2019.11.001/test/shared/mod_tests_scalapack_interfaces.F90 --- elpa-2016.05.001/test/shared/mod_tests_scalapack_interfaces.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/mod_tests_scalapack_interfaces.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,56 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! 
it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPCDF + + +#include "config-f90.h" +#define PRECISION_MODULE precision_for_tests +module tests_scalapack_interfaces + use iso_c_binding + use precision_for_tests + + implicit none + +#include "../../src/helpers/fortran_scalapack_interfaces.F90" + +end module + + diff -Nru elpa-2016.05.001/test/shared/test_analytic.F90 elpa-2019.11.001/test/shared/test_analytic.F90 --- elpa-2016.05.001/test/shared/test_analytic.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_analytic.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,190 @@ +! (c) Copyright Pavel Kus, 2017, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! 
Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. 
+ +#include "../Fortran/assert.h" +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +module test_analytic + + use test_util +#ifdef HAVE_DETAILED_TIMINGS + use ftimings +#else + use timings_dummy +#endif + use precision_for_tests + + interface prepare_matrix_analytic + module procedure prepare_matrix_analytic_complex_double + module procedure prepare_matrix_analytic_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure prepare_matrix_analytic_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure prepare_matrix_analytic_complex_single +#endif + end interface + + interface check_correctness_analytic + module procedure check_correctness_analytic_complex_double + module procedure check_correctness_analytic_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_analytic_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_analytic_complex_single +#endif + end interface + + + interface print_matrix + module procedure print_matrix_complex_double + module procedure print_matrix_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure print_matrix_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure print_matrix_complex_single +#endif + end interface + + TEST_INT_TYPE, parameter, private :: num_primes = 3 + TEST_INT_TYPE, parameter, private :: primes(num_primes) = (/2,3,5/) + + TEST_INT_TYPE, parameter, private :: ANALYTIC_MATRIX = 0 + TEST_INT_TYPE, parameter, private :: ANALYTIC_EIGENVECTORS = 1 + TEST_INT_TYPE, parameter, private :: 
ANALYTIC_EIGENVALUES = 2 + + contains + + function decompose(num, decomposition) result(possible) + implicit none + TEST_INT_TYPE, intent(in) :: num + TEST_INT_TYPE, intent(out) :: decomposition(num_primes) + logical :: possible + TEST_INT_TYPE :: reminder, prime, prime_id + + decomposition = 0 + possible = .true. + reminder = num + do prime_id = 1, num_primes + prime = primes(prime_id) + do while (MOD(reminder, prime) == 0) + decomposition(prime_id) = decomposition(prime_id) + 1 + reminder = reminder / prime + end do + end do + if(reminder > 1) then + possible = .false. + end if + end function + + function compose(decomposition) result(num) + implicit none + TEST_INT_TYPE, intent(in) :: decomposition(num_primes) + TEST_INT_TYPE :: num, prime_id + + num = 1; + do prime_id = 1, num_primes + num = num * primes(prime_id) ** decomposition(prime_id) + end do + end function + + +#include "../../src/general/prow_pcol.F90" +#include "../../src/general/map_global_to_local.F90" + + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_analytic_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_analytic_template.F90" +#undef SINGLE_PRECISION +#undef COMPLEXCASE + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_analytic_template.F90" +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_analytic_template.F90" +#undef SINGLE_PRECISION +#undef REALCASE + +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +end module diff -Nru elpa-2016.05.001/test/shared/test_analytic_template.F90 
elpa-2019.11.001/test/shared/test_analytic_template.F90 --- elpa-2016.05.001/test/shared/test_analytic_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_analytic_template.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,667 @@ +! (c) Copyright Pavel Kus, 2017, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! 
the original distribution, the GNU Lesser General Public License. + +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + + + subroutine prepare_matrix_analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times) + use precision_for_tests + + implicit none + TEST_INT_TYPE, intent(in) :: na, nblk, myid, np_rows, np_cols, my_prow, my_pcol + MATH_DATATYPE(kind=REAL_DATATYPE), intent(inout):: a(:,:) + logical, optional :: print_times + logical :: print_timer + TEST_INT_TYPE :: globI, globJ, locI, locJ, pi, pj, levels(num_primes) + integer(kind=c_int) :: loc_I, loc_J, p_i, p_j +#ifdef HAVE_DETAILED_TIMINGS + type(timer_t) :: timer +#else + type(timer_dummy_t) :: timer +#endif + + call timer%enable() + call timer%start("prepare_matrix_analytic") + + print_timer = .true. + + if (present(print_times)) then + print_timer = print_times + endif + + ! for debug only, do it systematicaly somehow ... unit tests + call check_module_sanity_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(myid) + + if(.not. decompose(na, levels)) then + if(myid == 0) then + print *, "Analytic test can be run only with matrix sizes of the form 2^n * 3^m * 5^o" + stop 1 + end if + end if + + call timer%start("loop") + do globI = 1, na + + p_i = prow(int(globI,kind=c_int), int(nblk,kind=c_int), int(np_rows,kind=c_int)) + pi = int(p_i,kind=INT_TYPE) + if (my_prow .ne. pi) cycle + + do globJ = 1, na + + p_j = pcol(int(globJ,kind=c_int), int(nblk,kind=c_int), int(np_cols,kind=c_int)) + pj = int(p_j,kind=INT_TYPE) + if (my_pcol .ne. 
pj) cycle + + if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + locI = int(loc_i,kind=INT_TYPE) + locJ = int(loc_j,kind=INT_TYPE) + call timer%start("evaluation") + a(locI, locJ) = analytic_matrix_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, globI, globJ) + call timer%stop("evaluation") + else + print *, "Warning ... error in preparation loop of the analytic test" + end if + end do + end do + call timer%stop("loop") + + call timer%stop("prepare_matrix_analytic") + if(myid == 0 .and. print_timer) then + call timer%print("prepare_matrix_analytic") + end if + call timer%free() + end subroutine + + function check_correctness_analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, check_all_evals, & + check_eigenvectors, print_times) result(status) + use precision_for_tests + + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE, intent(in) :: na, nev, nblk, myid, np_rows, & + np_cols, my_prow, my_pcol + TEST_INT_TYPE :: status + TEST_INT_MPI_TYPE :: mpierr + MATH_DATATYPE(kind=rck), intent(inout) :: z(:,:) + real(kind=rk), intent(inout) :: ev(:) + logical, intent(in) :: check_all_evals, check_eigenvectors + + TEST_INT_TYPE :: globI, globJ, locI, locJ, & + levels(num_primes) + integer(kind=c_int) :: loc_I, loc_J + real(kind=rk) :: diff, max_z_diff, max_ev_diff, & + glob_max_z_diff, max_curr_z_diff +#ifdef DOUBLE_PRECISION + real(kind=rk), parameter :: tol_eigenvalues = 5e-14_rk8 + real(kind=rk), parameter :: tol_eigenvectors = 6e-11_rk8 +#endif +#ifdef SINGLE_PRECISION + ! tolerance needs to be very high due to qr tests + ! it should be distinguished somehow! 
+ real(kind=rk), parameter :: tol_eigenvalues = 7e-6_rk4 + real(kind=rk), parameter :: tol_eigenvectors = 4e-3_rk4 +#endif + real(kind=rk) :: computed_ev, expected_ev + MATH_DATATYPE(kind=rck) :: computed_z, expected_z + + MATH_DATATYPE(kind=rck) :: max_value_for_normalization, & + computed_z_on_max_position, & + normalization_quotient + MATH_DATATYPE(kind=rck) :: max_values_array(np_rows * np_cols), & + corresponding_exact_value + integer(kind=c_int) :: max_value_idx, rank_with_max, & + rank_with_max_reduced, & + num_checked_evals + integer(kind=c_int) :: max_idx_array(np_rows * np_cols), & + rank + logical, optional :: print_times + logical :: print_timer + +#ifdef HAVE_DETAILED_TIMINGS + type(timer_t) :: timer +#else + type(timer_dummy_t) :: timer +#endif + + call timer%enable() + call timer%start("check_correctness_analytic") + + + print_timer = .true. + if (present(print_times)) then + print_timer = print_times + endif + + if(.not. decompose(na, levels)) then + print *, "can not decomopse matrix size" + stop 1 + end if + + if(check_all_evals) then + num_checked_evals = na + else + num_checked_evals = nev + endif + !call print_matrix(myid, na, z, "z") + max_z_diff = 0.0_rk + max_ev_diff = 0.0_rk + call timer%start("loop_eigenvalues") + do globJ = 1, num_checked_evals + computed_ev = ev(globJ) + call timer%start("evaluation") + expected_ev = analytic_eigenvalues_real_& + &PRECISION& + &(na, globJ) + call timer%stop("evaluation") + diff = abs(computed_ev - expected_ev) + max_ev_diff = max(diff, max_ev_diff) + end do + call timer%stop("loop_eigenvalues") + + call timer%start("loop_eigenvectors") + do globJ = 1, nev + max_curr_z_diff = 0.0_rk + + ! eigenvectors are unique up to multiplication by scalar (complex in complex case) + ! to be able to compare them with analytic, we have to normalize them somehow + ! we will find a value in computed eigenvector with highest absolut value and enforce + ! 
such multiple of computed eigenvector, that the value on corresponding position is the same + ! as an corresponding value in the analytical eigenvector + + ! find the maximal value in the local part of given eigenvector (with index globJ) + max_value_for_normalization = 0.0_rk + max_value_idx = -1 + do globI = 1, na + if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + locI = int(loc_I,kind=INT_TYPE) + locJ = int(loc_J,kind=INT_TYPE) + computed_z = z(locI, locJ) + if(abs(computed_z) > abs(max_value_for_normalization)) then + max_value_for_normalization = computed_z + max_value_idx = int(globI,kind=c_int) + end if + end if + end do + + ! find the global maximum and its position. From technical reasons (looking for a + ! maximum of complex number), it is not so easy to do it nicely. Therefore we + ! communicate local maxima to mpi rank 0 and resolve there. If we wanted to do + ! it without this, it would be tricky.. question of uniquness - two complex numbers + ! with the same absolut values, but completely different... 
+#ifdef WITH_MPI + call MPI_Gather(max_value_for_normalization, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & + max_values_array, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, 0_MPI_KIND, & + int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) + call MPI_Gather(max_value_idx, 1_MPI_KIND, MPI_INT, max_idx_array, 1_MPI_KIND, MPI_INT, & + 0_MPI_KIND, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) + max_value_for_normalization = 0.0_rk + max_value_idx = -1 + do rank = 1, np_cols * np_rows + if(abs(max_values_array(rank)) > abs(max_value_for_normalization)) then + max_value_for_normalization = max_values_array(rank) + max_value_idx = max_idx_array(rank) + end if + end do + call MPI_Bcast(max_value_for_normalization, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, & + 0_MPI_KIND, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) + call MPI_Bcast(max_value_idx, 1_MPI_KIND, MPI_INT, 0_MPI_KIND, & + int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) +#endif + ! we decided what the maximum computed value is. Calculate expected value on the same + if(abs(max_value_for_normalization) < 0.0001_rk) then + if(myid == 0) print *, 'Maximal value in eigenvector too small :', max_value_for_normalization + status =1 + return + end if + call timer%start("evaluation_helper") + corresponding_exact_value = analytic_eigenvectors_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, int(max_value_idx,kind=INT_TYPE), globJ) + call timer%stop("evaluation_helper") + normalization_quotient = corresponding_exact_value / max_value_for_normalization + ! write(*,*) "normalization q", normalization_quotient + + ! 
compare computed and expected eigenvector values, but take into account normalization quotient + do globI = 1, na + if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + locI = int(loc_I,kind=INT_TYPE) + locJ = int(loc_J,kind=INT_TYPE) + computed_z = z(locI, locJ) + call timer%start("evaluation") + expected_z = analytic_eigenvectors_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, globI, globJ) + call timer%stop("evaluation") + max_curr_z_diff = max(abs(normalization_quotient * computed_z - expected_z), max_curr_z_diff) + end if + end do + ! we have max difference of one of the eigenvectors, update global + max_z_diff = max(max_z_diff, max_curr_z_diff) + end do !globJ + call timer%stop("loop_eigenvectors") + +#ifdef WITH_MPI + call mpi_allreduce(max_z_diff, glob_max_z_diff, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, & + int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) +#else + glob_max_z_diff = max_z_diff +#endif + if(myid == 0) print *, 'Maximum error in eigenvalues :', max_ev_diff + if (check_eigenvectors) then + if(myid == 0) print *, 'Maximum error in eigenvectors :', glob_max_z_diff + endif + + status = 0 + if (nev .gt. 2) then + if (max_ev_diff .gt. tol_eigenvalues .or. max_ev_diff .eq. 0.0_rk) status = 1 + if (check_eigenvectors) then + if (glob_max_z_diff .gt. tol_eigenvectors .or. glob_max_z_diff .eq. 0.0_rk) status = 1 + endif + else + if (max_ev_diff .gt. tol_eigenvalues) status = 1 + if (check_eigenvectors) then + if (glob_max_z_diff .gt. tol_eigenvectors) status = 1 + endif + endif + + call timer%stop("check_correctness_analytic") + if(myid == 0 .and. 
print_timer) then + call timer%print("check_correctness_analytic") + end if + call timer%free() + end function + + + function analytic_matrix_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j) result(element) + use precision_for_tests + + implicit none + TEST_INT_TYPE, intent(in) :: na, i, j + MATH_DATATYPE(kind=REAL_DATATYPE) :: element + + element = analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j, ANALYTIC_MATRIX) + + end function + + function analytic_eigenvectors_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j) result(element) + use precision_for_tests + + implicit none + TEST_INT_TYPE, intent(in) :: na, i, j + MATH_DATATYPE(kind=REAL_DATATYPE) :: element + + element = analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j, ANALYTIC_EIGENVECTORS) + + end function + + function analytic_eigenvalues_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i) result(element) + use precision_for_tests + + implicit none + TEST_INT_TYPE, intent(in) :: na, i + real(kind=REAL_DATATYPE) :: element + + element = analytic_real_& + &PRECISION& + &(na, i, i, ANALYTIC_EIGENVALUES) + + end function + + function analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j, what) result(element) + use precision_for_tests + + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE, intent(in) :: na, i, j, what + MATH_DATATYPE(kind=rck) :: element, mat2x2(2,2), mat(5,5) + real(kind=rk) :: a, am, amp + TEST_INT_TYPE :: levels(num_primes) + TEST_INT_TYPE :: ii, jj, m, prime_id, prime, total_level, level + + real(kind=rk), parameter :: s = 0.5_rk + real(kind=rk), parameter :: c = 0.86602540378443864679_rk + real(kind=rk), parameter :: sq2 = 1.4142135623730950488_rk + + real(kind=rk), parameter :: largest_ev = 2.0_rk + + assert(i <= na) + assert(j <= na) + assert(i >= 0) + assert(j >= 0) + assert(decompose(na, levels)) + ! go to zero-based indexing + ii = i - 1 + jj = j - 1 + if (na .gt. 
2) then + a = exp(log(largest_ev)/(na-1)) + else + a = exp(log(largest_ev)/(1)) + endif + + element = 1.0_rck +#ifdef COMPLEXCASE + element = (1.0_rk, 0.0_rk) +#endif + total_level = 0 + am = a + do prime_id = 1,num_primes + prime = primes(prime_id) + do level = 1, levels(prime_id) + amp = am**(prime-1) + total_level = total_level + 1 + if(what == ANALYTIC_MATRIX) then +#ifdef REALCASE + mat2x2 = reshape((/ c*c + amp * s*s, (amp - 1.0_rk) * s*c, & + (amp - 1.0_rk) * s*c, s*s + amp * c*c /), & + (/2, 2/), order=(/2,1/)) +#endif +#ifdef COMPLEXCASE + mat2x2 = reshape((/ 0.5_rck * (amp + 1.0_rck) * (1.0_rk, 0.0_rk), sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, 1.0_rk), & + sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, -1.0_rk), 0.5_rck * (amp + 1.0_rck) * (1.0_rk, 0.0_rk) /), & + (/2, 2/), order=(/2,1/)) +! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/) +! until this is resolved, I resorted to the following + mat2x2(1,2) = sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, 1.0_rk) + mat2x2(2,1) = sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, -1.0_rk) +#endif + else if(what == ANALYTIC_EIGENVECTORS) then +#ifdef REALCASE + mat2x2 = reshape((/ c, s, & + -s, c /), & + (/2, 2/), order=(/2,1/)) +! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/) +! until this is resolved, I resorted to the following + mat2x2(1,2) = s + mat2x2(2,1) = -s +#endif +#ifdef COMPLEXCASE + mat2x2 = reshape((/ -sq2/2.0_rck * (1.0_rk, 0.0_rk), -sq2/2.0_rck * (1.0_rk, 0.0_rk), & + 0.5_rk * (1.0_rk, -1.0_rk), 0.5_rk * (-1.0_rk, 1.0_rk) /), & + (/2, 2/), order=(/2,1/)) +! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/) +! until this is resolved, I resorted to the following + mat2x2(1,2) = -sq2/2.0_rck * (1.0_rk, 0.0_rk) + mat2x2(2,1) = 0.5_rk * (1.0_rk, -1.0_rk) +#endif + else if(what == ANALYTIC_EIGENVALUES) then + mat2x2 = reshape((/ 1.0_rck, 0.0_rck, & + 0.0_rck, amp /), & + (/2, 2/), order=(/2,1/)) + else + assert(.false.) 
+ end if + + mat = 0.0_rck + if(prime == 2) then + mat(1:2, 1:2) = mat2x2 + else if(prime == 3) then + mat((/1,3/),(/1,3/)) = mat2x2 + if(what == ANALYTIC_EIGENVECTORS) then + mat(2,2) = 1.0_rck + else + mat(2,2) = am + end if + else if(prime == 5) then + mat((/1,5/),(/1,5/)) = mat2x2 + if(what == ANALYTIC_EIGENVECTORS) then + mat(2,2) = 1.0_rck + mat(3,3) = 1.0_rck + mat(4,4) = 1.0_rck + else + mat(2,2) = am + mat(3,3) = am**2 + mat(4,4) = am**3 + end if + else + assert(.false.) + end if + + ! write(*,*) "calc value, elem: ", element, ", mat: ", mod(ii,2), mod(jj,2), mat(mod(ii,2), mod(jj,2)), "am ", am + ! write(*,*) " matrix mat", mat + element = element * mat(mod(ii,prime) + 1, mod(jj,prime) + 1) + ii = ii / prime + jj = jj / prime + + am = am**prime + end do + end do + !write(*,*) "returning value ", element + end function + + + subroutine print_matrix_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(myid, na, mat, mat_name) + use precision_for_tests + + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE, intent(in) :: myid, na + character(len=*), intent(in) :: mat_name + MATH_DATATYPE(kind=rck) :: mat(na, na) + TEST_INT_TYPE :: i,j + character(len=20) :: na_str + + if(myid .ne. 
0) & + return + write(*,*) "Matrix: "//trim(mat_name) + write(na_str, *) na + do i = 1, na +#ifdef REALCASE + write(*, '('//trim(na_str)//'f8.3)') mat(i, :) +#endif +#ifdef COMPLEXCASE + write(*,'('//trim(na_str)//'(A,f8.3,A,f8.3,A))') ('(', real(mat(i,j)), ',', aimag(mat(i,j)), ')', j=1,na) +#endif + end do + write(*,*) + end subroutine + + + subroutine check_matrices_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(myid, na) + use precision_for_tests + + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE, intent(in) :: myid, na + MATH_DATATYPE(kind=rck) :: A(na, na), S(na, na), L(na, na), res(na, na) + TEST_INT_TYPE :: i, j, decomposition(num_primes) + + real(kind=rk) :: err +#ifdef DOUBLE_PRECISION + real(kind=rk), parameter :: TOL = 1e-8 +#endif +#ifdef SINGLE_PRECISION + real(kind=rk), parameter :: TOL = 1e-4 +#endif + + assert(decompose(na, decomposition)) + + do i = 1, na + do j = 1, na + A(i,j) = analytic_matrix_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j) + S(i,j) = analytic_eigenvectors_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j) + L(i,j) = analytic_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(na, i, j, ANALYTIC_EIGENVALUES) + end do + end do + + res = matmul(A,S) - matmul(S,L) + err = maxval(abs(res)) + + if(err > TOL) then + print *, "WARNING: sanity test in module analytic failed, error is ", err + end if + + if(.false.) then + !if(na == 2 .or. 
na == 5) then + call print_matrix(myid, na, A, "A") + call print_matrix(myid, na, S, "S") + call print_matrix(myid, na, L, "L") + + call print_matrix(myid, na, matmul(A,S), "AS") + call print_matrix(myid, na, matmul(S,L), "SL") + + call print_matrix(myid, na, res , "res") + end if + + end subroutine + + subroutine check_module_sanity_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(myid) + use precision_for_tests + + implicit none + TEST_INT_TYPE, intent(in) :: myid + TEST_INT_TYPE :: decomposition(num_primes), i + TEST_INT_TYPE, parameter :: check_sizes(7) = (/2, 3, 5, 6, 10, 25, 150/) + if(myid == 0) print *, "Checking test_analytic module sanity.... " +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT + assert(decompose(1500_lik, decomposition)) +#else + assert(decompose(1500_ik, decomposition)) +#endif + assert(all(decomposition == (/2,1,3/))) +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT + assert(decompose(6_lik,decomposition)) +#else + assert(decompose(6_ik,decomposition)) +#endif + assert(all(decomposition == (/1,1,0/))) + + do i =1, size(check_sizes) + call check_matrices_& + &MATH_DATATYPE& + &_& + &PRECISION& + &(myid, check_sizes(i)) + end do + + if(myid == 0) print *, "Checking test_analytic module sanity.... DONE" + + end subroutine diff -Nru elpa-2016.05.001/test/shared/test_blacs_infrastructure.F90 elpa-2019.11.001/test/shared/test_blacs_infrastructure.F90 --- elpa-2016.05.001/test/shared/test_blacs_infrastructure.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_blacs_infrastructure.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,207 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! 
- Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#define TEST_C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#define TEST_C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#define TEST_C_INT_MPI_TYPE_PTR long int* +#define TEST_C_INT_MPI_TYPE long int +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#define TEST_C_INT_MPI_TYPE_PTR int* +#define TEST_C_INT_MPI_TYPE int +#endif + +module test_blacs_infrastructure + + contains + + !c> void set_up_blacsgrid_f(TEST_C_INT_TYPE mpi_comm_parent, TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, char layout, + !c> TEST_C_INT_TYPE_PTR my_blacs_ctxt, TEST_C_INT_TYPE_PTR my_prow, + !c> TEST_C_INT_TYPE_PTR my_pcol); + subroutine set_up_blacsgrid(mpi_comm_parent, np_rows, np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) bind(C, name="set_up_blacsgrid_f") + + use precision_for_tests + use test_util + + implicit none + TEST_INT_TYPE, intent(in), value :: mpi_comm_parent, np_rows, np_cols +#ifdef SXAURORA + character(len=1), intent(in) :: layout +#else + character(len=1), intent(in), value :: layout +#endif + TEST_INT_TYPE, intent(out) :: my_blacs_ctxt, my_prow, my_pcol + +#ifdef WITH_MPI + TEST_INT_TYPE :: np_rows_, np_cols_ +#endif + + if (layout /= 'R' .and. 
layout /= 'C') then + print *, "layout must be 'R' or 'C'" + stop 1 + end if + + my_blacs_ctxt = mpi_comm_parent +#ifdef WITH_MPI + call BLACS_Gridinit(my_blacs_ctxt, layout, np_rows, np_cols) + call BLACS_Gridinfo(my_blacs_ctxt, np_rows_, np_cols_, my_prow, my_pcol) + if (np_rows /= np_rows_) then + print *, "BLACS_Gridinfo returned different values for np_rows as set by BLACS_Gridinit" + stop 1 + endif + if (np_cols /= np_cols_) then + print *, "BLACS_Gridinfo returned different values for np_cols as set by BLACS_Gridinit" + stop 1 + endif +#else + my_prow = 0 + my_pcol = 0 +#endif + end subroutine + + subroutine set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, my_blacs_ctxt, info) + + use elpa_utilities, only : error_unit + use test_util + use precision_for_tests + use tests_scalapack_interfaces + implicit none + + TEST_INT_TYPE, intent(in) :: na, nblk, my_prow, my_pcol, np_rows, & + np_cols, & + my_blacs_ctxt + TEST_INT_TYPE, intent(inout) :: info + TEST_INT_TYPE, intent(out) :: na_rows, na_cols, sc_desc(1:9) + +#ifdef WITH_MPI + TEST_INT_MPI_TYPE :: mpierr + + sc_desc(:) = 0 + ! determine the neccessary size of the distributed matrices, + ! we use the scalapack tools routine NUMROC + + na_rows = numroc(na, nblk, my_prow, 0_BLAS_KIND, np_rows) + na_cols = numroc(na, nblk, my_pcol, 0_BLAS_KIND, np_cols) + + ! set up the scalapack descriptor for the checks below + ! For ELPA the following restrictions hold: + ! - block sizes in both directions must be identical (args 4 a. 5) + ! - first row and column of the distributed matrix must be on + ! row/col 0/0 (arg 6 and 7) + + call descinit(sc_desc, na, na, nblk, nblk, 0_BLAS_KIND, 0_BLAS_KIND, & + my_blacs_ctxt, na_rows, info) + + if (info .ne. 0) then + write(error_unit,*) 'Error in BLACS descinit! 
info=',info + write(error_unit,*) 'Most likely this happend since you want to use' + write(error_unit,*) 'more MPI tasks than are possible for your' + write(error_unit,*) 'problem size (matrix size and blocksize)!' + write(error_unit,*) 'The blacsgrid can not be set up properly' + write(error_unit,*) 'Try reducing the number of MPI tasks...' + call MPI_ABORT(int(mpi_comm_world,kind=MPI_KIND), 1_MPI_KIND, mpierr) + endif +#else /* WITH_MPI */ + na_rows = na + na_cols = na +#endif /* WITH_MPI */ + + end subroutine + + !c> void set_up_blacs_descriptor_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nblk, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE_PTR na_rows, TEST_C_INT_TYPE_PTR na_cols, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE my_blacs_ctxt, + !c> TEST_C_INT_TYPE_PTR info); + subroutine set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, & + my_blacs_ctxt, info) & + bind(C, name="set_up_blacs_descriptor_f") + + use iso_c_binding + implicit none + + + TEST_INT_TYPE, value :: na, nblk, my_prow, my_pcol, np_rows, & + np_cols, my_blacs_ctxt + TEST_INT_TYPE :: na_rows, na_cols, info, sc_desc(1:9) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, my_blacs_ctxt, info) + + + end subroutine + + + function index_l2g(idx_loc, nblk, iproc, nprocs) result(indexl2g) + use precision_for_tests + implicit none + TEST_INT_TYPE :: indexl2g + TEST_INT_TYPE :: idx_loc, nblk, iproc, nprocs + indexl2g = nprocs * nblk * ((idx_loc-1) / nblk) + mod(idx_loc-1,nblk) + mod(nprocs+iproc, nprocs)*nblk + 1 + return + end function + +end module diff -Nru elpa-2016.05.001/test/shared/test_check_correctness.F90 elpa-2019.11.001/test/shared/test_check_correctness.F90 --- elpa-2016.05.001/test/shared/test_check_correctness.F90 1970-01-01 00:00:00.000000000 +0000 +++ 
elpa-2019.11.001/test/shared/test_check_correctness.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,156 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: A. 
Marek, MPCDF +#include "config-f90.h" + +module test_check_correctness + use test_util + + interface check_correctness_evp_numeric_residuals + module procedure check_correctness_evp_numeric_residuals_complex_double + module procedure check_correctness_evp_numeric_residuals_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_evp_numeric_residuals_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_evp_numeric_residuals_complex_single +#endif + end interface + + interface check_correctness_evp_numeric_residuals_ss +! module procedure check_correctness_evp_numeric_residuals_ss_complex_double + module procedure check_correctness_evp_numeric_residuals_ss_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_evp_numeric_residuals_ss_real_single +#endif +! #ifdef WANT_SINGLE_PRECISION_COMPLEX +! module procedure check_correctness_evp_numeric_residuals_ss_complex_single +! #endif + end interface + + interface check_correctness_eigenvalues_toeplitz + module procedure check_correctness_eigenvalues_toeplitz_complex_double + module procedure check_correctness_eigenvalues_toeplitz_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_eigenvalues_toeplitz_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_eigenvalues_toeplitz_complex_single +#endif + end interface + + interface check_correctness_eigenvalues_frank + module procedure check_correctness_eigenvalues_frank_complex_double + module procedure check_correctness_eigenvalues_frank_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_eigenvalues_frank_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_eigenvalues_frank_complex_single +#endif + end interface + + interface check_correctness_cholesky + module procedure check_correctness_cholesky_complex_double + module procedure 
check_correctness_cholesky_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_cholesky_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_cholesky_complex_single +#endif + end interface + + interface check_correctness_hermitian_multiply + module procedure check_correctness_hermitian_multiply_complex_double + module procedure check_correctness_hermitian_multiply_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure check_correctness_hermitian_multiply_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure check_correctness_hermitian_multiply_complex_single +#endif + end interface + + + contains + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_check_correctness_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_check_correctness_template.F90" +#undef SINGLE_PRECISION +#undef COMPLEXCASE +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_check_correctness_template.F90" +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_check_correctness_template.F90" +#undef SINGLE_PRECISION +#undef REALCASE + + +#endif /* WANT_SINGLE_PRECISION_REAL */ + +#include "../../src/general/prow_pcol.F90" +#include "../../src/general/map_global_to_local.F90" + +end module diff -Nru elpa-2016.05.001/test/shared/test_check_correctness_template.F90 elpa-2019.11.001/test/shared/test_check_correctness_template.F90 --- elpa-2016.05.001/test/shared/test_check_correctness_template.F90 1970-01-01 00:00:00.000000000 
+0000 +++ elpa-2019.11.001/test/shared/test_check_correctness_template.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,1134 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: A. 
Marek, MPCDF + + +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE lik +#define TEST_C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE ik +#define TEST_C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#endif + +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE lik +#define TEST_C_INT_MPI_TYPE_PTR long int* +#define TEST_C_INT_MPI_TYPE long int +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE ik +#define TEST_C_INT_MPI_TYPE_PTR int* +#define TEST_C_INT_MPI_TYPE int +#endif + +#if REALCASE == 1 + function check_correctness_evp_numeric_residuals_ss_real_& + &PRECISION& + & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) + use tests_blas_interfaces + use tests_scalapack_interfaces + use precision_for_tests + use iso_c_binding + implicit none +#include "../../src/general/precision_kinds.F90" + integer(kind=BLAS_KIND) :: status, na_cols, na_rows + integer(kind=BLAS_KIND), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol + real(kind=rk), intent(in) :: as(:,:) + real(kind=rk) :: tmpr + complex(kind=rck), intent(in) :: z(:,:) + real(kind=rk) :: ev(:) + complex(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2 + complex(kind=rck) :: xc + + complex(kind=rck), allocatable :: as_complex(:,:) + + integer(kind=BLAS_KIND) :: sc_desc(:) + + integer(kind=BLAS_KIND) :: i, j, rowLocal, colLocal + integer(kind=c_int) :: row_Local, col_Local + real(kind=rck) :: err, errmax + + integer :: mpierr + + ! 
tolerance for the residual test for different math type/precision setups + real(kind=rk), parameter :: tol_res_real_double = 5e-4_rk + real(kind=rk), parameter :: tol_res_real_single = 3e-2_rk + real(kind=rk), parameter :: tol_res_complex_double = 5e-12_rk + real(kind=rk), parameter :: tol_res_complex_single = 3e-2_rk + real(kind=rk) :: tol_res = tol_res_& + &MATH_DATATYPE& + &_& + &PRECISION + ! precision of generalized problem is lower + real(kind=rk), parameter :: generalized_penalty = 10.0_rk + + ! tolerance for the orthogonality test for different math type/precision setups +! real(kind=rk), parameter :: tol_orth_real_double = 5e-11_rk + real(kind=rk), parameter :: tol_orth_real_double = 5e-4_rk + real(kind=rk), parameter :: tol_orth_real_single = 9e-2_rk + real(kind=rk), parameter :: tol_orth_complex_double = 5e-11_rk + real(kind=rk), parameter :: tol_orth_complex_single = 9e-3_rk + real(kind=rk), parameter :: tol_orth = tol_orth_& + &MATH_DATATYPE& + &_& + &PRECISION + + complex(kind=rck), parameter :: CZERO = (0.0_rck,0.0_rck), CONE = (1.0_rck,0.0_rck) + + + status = 0 + ! Setup complex matrices and eigenvalues + na_rows = size(as,dim=1) + na_cols = size(as,dim=2) + + allocate(as_complex(na_rows,na_cols)) + do j=1, na_cols + do i=1,na_rows +#ifdef DOUBLE_PRECISION_REAL + as_complex(i,j) = dcmplx(as(i,j),0.0_rk) +#else + as_complex(i,j) = cmplx(as(i,j),0.0_rk) +#endif + enddo + enddo + + ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) + + ! 
tmp1 = Zi*EVi + tmp1(:,:) = z(:,:) + do i=1,nev +#ifdef DOUBLE_PRECISION_REAL + xc = dcmplx(0.0_rk,ev(i)) +#else + xc = cmplx(0.0_rk,ev(i)) +#endif +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call pzscal(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND) +#else + call pcscal(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND) +#endif +#else /* WITH_MPI */ +#ifdef DOUBLE_PRECISION_REAL + call zscal(int(na,kind=BLAS_KIND), xc, tmp1(:,i), 1_BLAS_KIND) +#else + call cscal(int(na,kind=BLAS_KIND), xc, tmp1(:,i), 1_BLAS_KIND) +#endif +#endif /* WITH_MPI */ + enddo + + ! normal eigenvalue problem .. no need to multiply + tmp2(:,:) = tmp1(:,:) + + ! tmp1 = A * Z + ! as is original stored matrix, Z are the EVs +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call PZGEMM('N', 'N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), & + CONE, as_complex, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else + call PCGEMM('N', 'N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), & + CONE, as_complex, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#endif +#else /* WITH_MPI */ +#ifdef DOUBLE_PRECISION_REAL + call ZGEMM('N','N',int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), CONE, & + as_complex, int(na,kind=BLAS_KIND), z,int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND) ) +#else + call CGEMM('N','N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), CONE, & + as_complex, int(na,kind=BLAS_KIND), z, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND) ) +#endif +#endif /* WITH_MPI */ + + ! tmp1 = A*Zi - Zi*EVi + tmp1(:,:) = tmp1(:,:) - tmp2(:,:) + + ! 
Get maximum norm of columns of tmp1 + errmax = 0.0_rk + + do i=1,nev + xc = (0.0_rk,0.0_rk) +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call PZDOTC(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, & + 1_BLAS_KIND, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND) +#else + call PCDOTC(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, & + 1_BLAS_KIND, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND) +#endif +#else /* WITH_MPI */ +#ifdef DOUBLE_PRECISION_REAL + xc = ZDOTC(int(na,kind=BLAS_KIND) ,tmp1, 1_BLAS_KIND, tmp1, 1_BLAS_KIND) +#else + xc = CDOTC(int(na,kind=BLAS_KIND) ,tmp1, 1_BLAS_KIND, tmp1, 1_BLAS_KIND) +#endif +#endif /* WITH_MPI */ + errmax = max(errmax, sqrt(real(xc,kind=REAL_DATATYPE))) + enddo + + ! Get maximum error norm over all processors + err = errmax +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'%Results of numerical residual checks, using complex arithmetic:' + if (myid==0) print *,'%Error Residual :',errmax + if (nev .ge. 2) then + if (errmax .gt. tol_res .or. errmax .eq. 0.0_rk) then + status = 1 + endif + else + if (errmax .gt. tol_res) then + status = 1 + endif + endif + + ! 2. 
Eigenvector orthogonality + tmp2(:,:) = z(:,:) + tmp1 = 0 +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call PZGEMM('C', 'N', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), & + CONE, z, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else + call PCGEMM('C', 'N', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), & + CONE, z, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#endif + +#else /* WITH_MPI */ +#ifdef DOUBLE_PRECISION_REAL + call ZGEMM('C','N', int(nev,kind=BLAS_KIND) , int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND),CONE, z, & + int(na,kind=BLAS_KIND), tmp2, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND)) +#else + call CGEMM('C','N', int(nev,kind=BLAS_KIND) , int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND),CONE, z, & + int(na,kind=BLAS_KIND), tmp2, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND)) +#endif +#endif /* WITH_MPI */ + ! First check, whether the elements on diagonal are 1 .. "normality" of the vectors + err = 0.0_rk + do i=1, nev + if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int), row_Local, col_Local, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int)) ) then + rowLocal = int(row_Local,kind=INT_TYPE) + colLocal = int(col_Local,kind=INT_TYPE) + err = max(err, abs(tmp1(rowLocal,colLocal) - CONE)) + endif + end do +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'%Maximal error in eigenvector lengths:',errmax + + ! Second, find the maximal error in the whole Z**T * Z matrix (its diference from identity matrix) + ! 
Initialize tmp2 to unit matrix + tmp2 = 0 +#ifdef WITH_MPI +#ifdef DOUBLE_PRECISION_REAL + call PZLASET('A', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), CZERO, CONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else + call PCLASET('A', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), CZERO, CONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#endif +#else /* WITH_MPI */ +#ifdef DOUBLE_PRECISION_REAL + call ZLASET('A',int(nev,kind=BLAS_KIND) ,int(nev,kind=BLAS_KIND) ,CZERO, CONE, tmp2, int(na,kind=BLAS_KIND)) +#else + call CLASET('A',int(nev,kind=BLAS_KIND) ,int(nev,kind=BLAS_KIND) ,CZERO, CONE, tmp2, int(na,kind=BLAS_KIND)) +#endif +#endif /* WITH_MPI */ + + ! ! tmp1 = Z**T * Z - Unit Matrix + tmp1(:,:) = tmp1(:,:) - tmp2(:,:) + + ! Get maximum error (max abs value in tmp1) + err = maxval(abs(tmp1)) +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'%Error Orthogonality:',errmax + + if (nev .ge. 2) then + if (errmax .gt. tol_orth .or. errmax .eq. 0.0_rk) then + status = 1 + endif + else + if (errmax .gt. 
tol_orth) then + status = 1 + endif + endif + + deallocate(as_complex) + end function + +#endif /* REALCASE */ + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> double *as, complex double *z, double *ev, TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#else + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> float *as, complex float *z, float *ev, TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#endif +#endif /* REALCASE */ + +#if REALCASE == 1 +function check_correctness_evp_numeric_residuals_ss_real_& +&PRECISION& +&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) & + bind(C,name="check_correctness_evp_numeric_residuals_ss_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_f") + + use precision_for_tests + use iso_c_binding + + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE :: status + TEST_INT_TYPE, value :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol + real(kind=rck) :: as(1:na_rows,1:na_cols) + complex(kind=rck) :: z(1:na_rows,1:na_cols) + real(kind=rck) :: ev(1:na) + TEST_INT_TYPE :: sc_desc(1:9) + + status = check_correctness_evp_numeric_residuals_ss_real_& + &PRECISION& + & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + end function +#endif /* REALCASE */ + +function check_correctness_evp_numeric_residuals_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, nev, 
as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs) result(status) + + use tests_blas_interfaces + use tests_scalapack_interfaces + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE :: status + TEST_INT_TYPE, intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol + MATH_DATATYPE(kind=rck), intent(in) :: as(:,:), z(:,:) + MATH_DATATYPE(kind=rck), intent(in), optional :: bs(:,:) + real(kind=rk) :: ev(:) + MATH_DATATYPE(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2 + MATH_DATATYPE(kind=rck) :: xc + + TEST_INT_TYPE :: sc_desc(:) + + TEST_INT_TYPE :: i, rowLocal, colLocal + integer(kind=c_int) :: row_Local, col_Local + real(kind=rck) :: err, errmax + + TEST_INT_MPI_TYPE :: mpierr + +! tolerance for the residual test for different math type/precision setups + real(kind=rk), parameter :: tol_res_real_double = 5e-12_rk + real(kind=rk), parameter :: tol_res_real_single = 3e-2_rk + real(kind=rk), parameter :: tol_res_complex_double = 5e-12_rk + real(kind=rk), parameter :: tol_res_complex_single = 3e-2_rk + real(kind=rk) :: tol_res = tol_res_& + &MATH_DATATYPE& + &_& + &PRECISION + ! precision of generalized problem is lower + real(kind=rk), parameter :: generalized_penalty = 10.0_rk + + ! tolerance for the orthogonality test for different math type/precision setups + real(kind=rk), parameter :: tol_orth_real_double = 5e-11_rk + real(kind=rk), parameter :: tol_orth_real_single = 9e-2_rk + real(kind=rk), parameter :: tol_orth_complex_double = 5e-11_rk + real(kind=rk), parameter :: tol_orth_complex_single = 9e-3_rk + real(kind=rk), parameter :: tol_orth = tol_orth_& + &MATH_DATATYPE& + &_& + &PRECISION + + if (present(bs)) then + tol_res = generalized_penalty * tol_res + endif + status = 0 + + ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) + +! 
tmp1 = Zi*EVi + tmp1(:,:) = z(:,:) + do i=1,nev + xc = ev(i) +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &scal(na, xc, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND) +#else /* WITH_MPI */ + call BLAS_CHAR& + &scal(na, xc, tmp1(:,i), 1_BLAS_KIND) +#endif /* WITH_MPI */ + enddo + + ! for generalized EV problem, multiply by bs as well + ! tmp2 = B * tmp1 + if(present(bs)) then +#ifdef WITH_MPI + call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, bs, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call PRECISION_GEMM('N','N',na,nev,na,ONE,bs,na,tmp1,na,ZERO,tmp2,na) +#endif /* WITH_MPI */ + else + ! normal eigenvalue problem .. no need to multiply + tmp2(:,:) = tmp1(:,:) + end if + + ! tmp1 = A * Z + ! as is original stored matrix, Z are the EVs +#ifdef WITH_MPI + call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, as, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call PRECISION_GEMM('N','N',na,nev,na,ONE,as,na,z,na,ZERO,tmp1,na) +#endif /* WITH_MPI */ + + ! tmp1 = A*Zi - Zi*EVi + tmp1(:,:) = tmp1(:,:) - tmp2(:,:) + + ! Get maximum norm of columns of tmp1 + errmax = 0.0_rk + + do i=1,nev +#if REALCASE == 1 + err = 0.0_rk +#ifdef WITH_MPI + call scal_PRECISION_NRM2(na, err, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND) +#else /* WITH_MPI */ + err = PRECISION_NRM2(na,tmp1(1,i),1_BLAS_KIND) +#endif /* WITH_MPI */ + errmax = max(errmax, err) +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + xc = 0 +#ifdef WITH_MPI + call scal_PRECISION_DOTC(na, xc, tmp1, 1_BLAS_KIND, i, sc_desc, & + 1_BLAS_KIND, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND) +#else /* WITH_MPI */ + xc = PRECISION_DOTC(na,tmp1,1_BLAS_KIND,tmp1,1_BLAS_KIND) +#endif /* WITH_MPI */ + errmax = max(errmax, sqrt(real(xc,kind=REAL_DATATYPE))) +#endif /* COMPLEXCASE */ + enddo + + ! 
Get maximum error norm over all processors + err = errmax +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'Results of numerical residual checks:' + if (myid==0) print *,'Error Residual :',errmax + if (nev .ge. 2) then + if (errmax .gt. tol_res .or. errmax .eq. 0.0_rk) then + status = 1 + endif + else + if (errmax .gt. tol_res) then + status = 1 + endif + endif + + ! 2. Eigenvector orthogonality + if(present(bs)) then + !for the generalized EVP, the eigenvectors should be B-orthogonal, not orthogonal + ! tmp2 = B * Z + tmp2(:,:) = 0.0_rck +#ifdef WITH_MPI + call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, bs, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call PRECISION_GEMM('N','N', na, nev, na, ONE, bs, na, z, na, ZERO, tmp2, na) +#endif /* WITH_MPI */ + + else + tmp2(:,:) = z(:,:) + endif + ! tmp1 = Z**T * tmp2 + ! actually tmp1 = Z**T * Z for standard case and tmp1 = Z**T * B * Z for generalized + tmp1 = 0 +#ifdef WITH_MPI + call scal_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nev, nev, na, ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, & + tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ,'N',nev,nev,na,ONE,z,na,tmp2,na,ZERO,tmp1,na) +#endif /* WITH_MPI */ + ! First check, whether the elements on diagonal are 1 .. 
"normality" of the vectors + err = 0.0_rk + do i=1, nev + if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int) , row_Local, col_Local, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), & + int(np_cols,kind=c_int), int(my_prow,kind=c_int), & + int(my_pcol,kind=c_int) )) then + rowLocal = int(row_Local,kind=INT_TYPE) + colLocal = int(col_Local,kind=INT_TYPE) + err = max(err, abs(tmp1(rowLocal,colLocal) - 1.0_rk)) + endif + end do +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'Maximal error in eigenvector lengths:',errmax + + ! Second, find the maximal error in the whole Z**T * Z matrix (its diference from identity matrix) + ! Initialize tmp2 to unit matrix + tmp2 = 0 +#ifdef WITH_MPI + call scal_PRECISION_LASET('A', nev, nev, ZERO, ONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call PRECISION_LASET('A',nev,nev,ZERO,ONE,tmp2,na) +#endif /* WITH_MPI */ + + ! ! tmp1 = Z**T * Z - Unit Matrix + tmp1(:,:) = tmp1(:,:) - tmp2(:,:) + + ! Get maximum error (max abs value in tmp1) + err = maxval(abs(tmp1)) +#ifdef WITH_MPI + call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr) +#else /* WITH_MPI */ + errmax = err +#endif /* WITH_MPI */ + if (myid==0) print *,'Error Orthogonality:',errmax + + if (nev .ge. 2) then + if (errmax .gt. tol_orth .or. errmax .eq. 0.0_rk) then + status = 1 + endif + else + if (errmax .gt. 
tol_orth) then + status = 1 + endif + endif + end function + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> double *as, double *z, double *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#else + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> float *as, float *z, float *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> complex double *as, complex double *z, double *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#else + !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> complex float *as, complex float *z, float *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#endif +#endif /* COMPLEXCASE */ + +function check_correctness_evp_numeric_residuals_& 
+&MATH_DATATYPE& +&_& +&PRECISION& +&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) & + bind(C,name="check_correctness_evp_numeric_residuals_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_f") + + use precision_for_tests + use iso_c_binding + + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE :: status + TEST_INT_TYPE, value :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol + MATH_DATATYPE(kind=rck) :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols) + real(kind=rck) :: ev(1:na) + TEST_INT_TYPE :: sc_desc(1:9) + + status = check_correctness_evp_numeric_residuals_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) + + end function + +!---- variant for the generalized eigenproblem +!---- unlike in Fortran, we cannot use optional parameter +!---- we thus define a different function +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> double *as, double *z, double *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol, + !c> double *bs); +#else + !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> float *as, float *z, float *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, + !c> TEST_C_INT_TYPE my_pcol, + !c> float *bs); +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + !c> 
TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> complex double *as, complex double *z, double *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol, + !c> complex double *bs); +#else + !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, + !c> TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols, + !c> complex float *as, complex float *z, float *ev, + !c> TEST_C_INT_TYPE sc_desc[9], + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, + !c> TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol, + !c> complex float *bs); +#endif +#endif /* COMPLEXCASE */ + +function check_correctness_evp_gen_numeric_residuals_& +&MATH_DATATYPE& +&_& +&PRECISION& +&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs) result(status) & + bind(C,name="check_correctness_evp_gen_numeric_residuals_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_f") + + use iso_c_binding + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE :: status + TEST_INT_TYPE, value :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol + MATH_DATATYPE(kind=rck) :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols), bs(1:na_rows,1:na_cols) + real(kind=rck) :: ev(1:na) + TEST_INT_TYPE :: sc_desc(1:9) + + status = check_correctness_evp_numeric_residuals_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs) + + end function + + !----------------------------------------------------------------------------------------------------------- + + function 
check_correctness_eigenvalues_toeplitz_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, diagonalElement, subdiagonalElement, ev, z, myid) result(status) + use iso_c_binding + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE :: status, ii, j, myid + TEST_INT_TYPE, intent(in) :: na + real(kind=rck) :: diagonalElement, subdiagonalElement + real(kind=rck) :: ev_analytic(na), ev(na) + MATH_DATATYPE(kind=rck) :: z(:,:) + +#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX) + real(kind=rck), parameter :: pi = 3.141592653589793238462643383279_c_double +#else + real(kind=rck), parameter :: pi = 3.1415926535897932_c_float +#endif + real(kind=rck) :: tmp, maxerr + TEST_INT_TYPE :: loctmp + status = 0 + + ! analytic solution + do ii=1, na + ev_analytic(ii) = diagonalElement + 2.0_rk * & + subdiagonalElement *cos( pi*real(ii,kind=rk)/ & + real(na+1,kind=rk) ) + enddo + + ! sort analytic solution: + + ! this hack is neither elegant, nor optimized: for huge matrixes it might be expensive + ! a proper sorting algorithmus might be implemented here + + tmp = minval(ev_analytic) + loctmp = minloc(ev_analytic, 1) + + ev_analytic(loctmp) = ev_analytic(1) + ev_analytic(1) = tmp + do ii=2, na + tmp = ev_analytic(ii) + do j= ii, na + if (ev_analytic(j) .lt. tmp) then + tmp = ev_analytic(j) + loctmp = j + endif + enddo + ev_analytic(loctmp) = ev_analytic(ii) + ev_analytic(ii) = tmp + enddo + + ! compute a simple error max of eigenvalues + maxerr = 0.0 + maxerr = maxval( (ev(:) - ev_analytic(:))/ev_analytic(:) , 1) + +#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX) + if (maxerr .gt. 8.e-13_c_double .or. maxerr .eq. 0.0_c_double) then +#else + if (maxerr .gt. 8.e-4_c_float .or. maxerr .eq. 0.0_c_float) then +#endif + status = 1 + if (myid .eq. 0) then + print *,"Result of Toeplitz matrix test: " + print *,"Eigenvalues differ from analytic solution: maxerr = ",maxerr + endif + endif + + if (status .eq. 
0) then + if (myid .eq. 0) then + print *,"Result of Toeplitz matrix test: test passed" + print *,"Eigenvalues differ from analytic solution: maxerr = ",maxerr + endif + endif + end function + + function check_correctness_cholesky_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, a, as, na_rows, sc_desc, myid) result(status) + use precision_for_tests + use tests_blas_interfaces + use tests_scalapack_interfaces + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE :: status + TEST_INT_TYPE, intent(in) :: na, myid, na_rows + + MATH_DATATYPE(kind=rck), intent(in) :: a(:,:), as(:,:) + MATH_DATATYPE(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2 +#if COMPLEXCASE == 1 + ! needed for [z,c]lange from scalapack + real(kind=rk), dimension(2*size(as,dim=1),size(as,dim=2)) :: tmp1_real +#endif + real(kind=rk) :: norm, normmax + + TEST_INT_TYPE :: sc_desc(:) + real(kind=rck) :: err, errmax + TEST_INT_MPI_TYPE :: mpierr + + status = 0 + tmp1(:,:) = 0.0_rck + + +#if REALCASE == 1 + ! tmp1 = a**T +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &tran(na, na, 1.0_rck, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + 0.0_rck, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + tmp1 = transpose(a) +#endif /* WITH_MPI */ +#endif /* REALCASE == 1 */ + +#if COMPLEXCASE == 1 + ! tmp1 = a**H +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &tranc(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + tmp1 = transpose(conjg(a)) +#endif /* WITH_MPI */ +#endif /* COMPLEXCASE == 1 */ + + ! tmp2 = a**T * a +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &gemm("N","N", na, na, na, ONE, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + a, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + call BLAS_CHAR& + &gemm("N","N", na, na, na, ONE, tmp1, na, a, na, ZERO, tmp2, na) +#endif /* WITH_MPI */ + + ! 
compare tmp2 with original matrix + tmp2(:,:) = tmp2(:,:) - as(:,:) + +#ifdef WITH_MPI + norm = p& + &BLAS_CHAR& + &lange("M",na, na, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & +#if COMPLEXCASE == 1 + tmp1_real) +#else + tmp1) +#endif +#else /* WITH_MPI */ + norm = BLAS_CHAR& + &lange("M", na, na, tmp2, na_rows, & +#if COMPLEXCASE == 1 + tmp1_real) +#else + tmp1) +#endif +#endif /* WITH_MPI */ + + +#ifdef WITH_MPI + call mpi_allreduce(norm, normmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr) +#else /* WITH_MPI */ + normmax = norm +#endif /* WITH_MPI */ + + if (myid .eq. 0) then + print *," Maximum error of result: ", normmax + endif + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL +! if (normmax .gt. 5e-12_rk8 .or. normmax .eq. 0.0_rk8) then + if (normmax .gt. 5e-12_rk8) then + status = 1 + endif +#else +! if (normmax .gt. 5e-4_rk4 .or. normmax .eq. 0.0_rk4) then + if (normmax .gt. 5e-4_rk4 ) then + status = 1 + endif +#endif +#endif + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX +! if (normmax .gt. 5e-11_rk8 .or. normmax .eq. 0.0_rk8) then + if (normmax .gt. 5e-11_rk8 ) then + status = 1 + endif +#else +! if (normmax .gt. 5e-3_rk4 .or. normmax .eq. 0.0_rk4) then + if (normmax .gt. 
5e-3_rk4) then + status = 1 + endif +#endif +#endif + end function + + function check_correctness_hermitian_multiply_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, a, b, c, na_rows, sc_desc, myid) result(status) + use precision_for_tests + use tests_blas_interfaces + use tests_scalapack_interfaces + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE :: status + TEST_INT_TYPE, intent(in) :: na, myid, na_rows + MATH_DATATYPE(kind=rck), intent(in) :: a(:,:), b(:,:), c(:,:) + MATH_DATATYPE(kind=rck), dimension(size(a,dim=1),size(a,dim=2)) :: tmp1, tmp2 +#if COMPLEXCASE == 1 + real(kind=rk), dimension(2*size(a,dim=1),size(a,dim=2)) :: tmp1_real +#endif + real(kind=rck) :: norm, normmax + + + TEST_INT_TYPE :: sc_desc(:) + real(kind=rck) :: err, errmax + TEST_INT_MPI_TYPE :: mpierr + + status = 0 + tmp1(:,:) = ZERO + +#if REALCASE == 1 + ! tmp1 = a**T +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &tran(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + tmp1 = transpose(a) +#endif /* WITH_MPI */ + +#endif /* REALCASE == 1 */ + +#if COMPLEXCASE == 1 + ! tmp1 = a**H +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &tranc(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else /* WITH_MPI */ + tmp1 = transpose(conjg(a)) +#endif /* WITH_MPI */ +#endif /* COMPLEXCASE == 1 */ + + ! tmp2 = tmp1 * b +#ifdef WITH_MPI + call p& + &BLAS_CHAR& + &gemm("N","N", na, na, na, ONE, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, b, 1_BLAS_KIND, 1_BLAS_KIND, & + sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) +#else + call BLAS_CHAR& + &gemm("N","N", na, na, na, ONE, tmp1, na, b, na, ZERO, tmp2, na) +#endif + + ! compare tmp2 with c + tmp2(:,:) = tmp2(:,:) - c(:,:) + +#ifdef WITH_MPI + ! dirty hack: the last argument should be a real array, but is not referenced + ! 
if mode = "M", thus we get away with a complex argument + norm = p& + &BLAS_CHAR& + &lange("M", na, na, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & +#if COMPLEXCASE == 1 + tmp1_real) +#else + tmp1) +#endif +#else /* WITH_MPI */ + ! dirty hack: the last argument should be a real array, but is not referenced + ! if mode = "M", thus we get away with a complex argument + norm = BLAS_CHAR& + &lange("M", na, na, tmp2, na_rows, & +#if COMPLEXCASE == 1 + tmp1_real) +#else + tmp1) +#endif +#endif /* WITH_MPI */ + +#ifdef WITH_MPI + call mpi_allreduce(norm, normmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr) +#else /* WITH_MPI */ + normmax = norm +#endif /* WITH_MPI */ + + if (myid .eq. 0) then + print *," Maximum error of result: ", normmax + endif + +#ifdef DOUBLE_PRECISION_REAL + if (normmax .gt. 5e-11_rk8 ) then + status = 1 + endif +#else + if (normmax .gt. 5e-3_rk4 ) then + status = 1 + endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + if (normmax .gt. 5e-11_rk8 ) then + status = 1 + endif +#else + if (normmax .gt. 5e-3_rk4 ) then + status = 1 + endif +#endif + end function + + function check_correctness_eigenvalues_frank_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, ev, z, myid) result(status) + use iso_c_binding + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE :: status, i, j, myid + TEST_INT_TYPE, intent(in) :: na + real(kind=rck) :: ev_analytic(na), ev(na) + MATH_DATATYPE(kind=rck) :: z(:,:) + +#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX) + real(kind=rck), parameter :: pi = 3.141592653589793238462643383279_c_double +#else + real(kind=rck), parameter :: pi = 3.1415926535897932_c_float +#endif + real(kind=rck) :: tmp, maxerr + TEST_INT_TYPE :: loctmp + status = 0 + + ! 
analytic solution + do i = 1, na + j = na - i +#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX) + ev_analytic(i) = pi * (2.0_c_double * real(j,kind=c_double) + 1.0_c_double) / & + (2.0_c_double * real(na,kind=c_double) + 1.0_c_double) + ev_analytic(i) = 0.5_c_double / (1.0_c_double - cos(ev_analytic(i))) +#else + ev_analytic(i) = pi * (2.0_c_float * real(j,kind=c_float) + 1.0_c_float) / & + (2.0_c_float * real(na,kind=c_float) + 1.0_c_float) + ev_analytic(i) = 0.5_c_float / (1.0_c_float - cos(ev_analytic(i))) +#endif + enddo + + ! sort analytic solution: + + ! this hack is neither elegant, nor optimized: for huge matrixes it might be expensive + ! a proper sorting algorithmus might be implemented here + + tmp = minval(ev_analytic) + loctmp = minloc(ev_analytic, 1) + + ev_analytic(loctmp) = ev_analytic(1) + ev_analytic(1) = tmp + do i=2, na + tmp = ev_analytic(i) + do j= i, na + if (ev_analytic(j) .lt. tmp) then + tmp = ev_analytic(j) + loctmp = j + endif + enddo + ev_analytic(loctmp) = ev_analytic(i) + ev_analytic(i) = tmp + enddo + + ! compute a simple error max of eigenvalues + maxerr = 0.0 + maxerr = maxval( (ev(:) - ev_analytic(:))/ev_analytic(:) , 1) + +#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX) + if (maxerr .gt. 8.e-13_c_double) then +#else + if (maxerr .gt. 8.e-4_c_float) then +#endif + status = 1 + if (myid .eq. 0) then + print *,"Result of Frank matrix test: " + print *,"Eigenvalues differ from analytic solution: maxerr = ",maxerr + endif + endif + end function + +! 
vim: syntax=fortran diff -Nru elpa-2016.05.001/test/shared/test_output_type.F90 elpa-2019.11.001/test/shared/test_output_type.F90 --- elpa-2016.05.001/test/shared/test_output_type.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_output_type.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,11 @@ +#include "config-f90.h" + +module test_output_type + + type :: output_t + logical :: eigenvectors + logical :: eigenvalues + end type + + +end module diff -Nru elpa-2016.05.001/test/shared/test_precision_kinds.F90 elpa-2019.11.001/test/shared/test_precision_kinds.F90 --- elpa-2016.05.001/test/shared/test_precision_kinds.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_precision_kinds.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,25 @@ +#ifdef REALCASE +#ifdef DOUBLE_PRECISION + integer, parameter :: rk = C_DOUBLE + integer, parameter :: rck = C_DOUBLE +#endif +#ifdef SINGLE_PRECISION + integer, parameter :: rk = C_FLOAT + integer, parameter :: rck = C_FLOAT +#endif + real(kind=rck), parameter :: ZERO=0.0_rk, ONE = 1.0_rk +#endif + +#ifdef COMPLEXCASE +#ifdef DOUBLE_PRECISION + integer, parameter :: rk = C_DOUBLE + integer, parameter :: ck = C_DOUBLE_COMPLEX + integer, parameter :: rck = C_DOUBLE_COMPLEX +#endif +#ifdef SINGLE_PRECISION + integer, parameter :: rk = C_FLOAT + integer, parameter :: ck = C_FLOAT_COMPLEX + integer, parameter :: rck = C_FLOAT_COMPLEX +#endif + complex(kind=rck), parameter :: ZERO = (0.0_rk,0.0_rk), ONE = (1.0_rk,0.0_rk) +#endif diff -Nru elpa-2016.05.001/test/shared/test_prepare_matrix.F90 elpa-2019.11.001/test/shared/test_prepare_matrix.F90 --- elpa-2016.05.001/test/shared/test_prepare_matrix.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_prepare_matrix.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,145 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! 
consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: A. 
Marek, MPCDF +#include "config-f90.h" + +module test_prepare_matrix + + use precision_for_tests + interface prepare_matrix_random + module procedure prepare_matrix_random_complex_double + module procedure prepare_matrix_random_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure prepare_matrix_random_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure prepare_matrix_random_complex_single +#endif + end interface + + + interface prepare_matrix_random_spd + module procedure prepare_matrix_random_spd_complex_double + module procedure prepare_matrix_random_spd_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure prepare_matrix_random_spd_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure prepare_matrix_random_spd_complex_single +#endif + end interface + + + interface prepare_matrix_toeplitz + module procedure prepare_matrix_toeplitz_complex_double + module procedure prepare_matrix_toeplitz_real_double + module procedure prepare_matrix_toeplitz_mixed_complex_complex_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure prepare_matrix_toeplitz_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure prepare_matrix_toeplitz_complex_single + module procedure prepare_matrix_toeplitz_mixed_complex_complex_single +#endif + end interface + + interface prepare_matrix_frank + module procedure prepare_matrix_frank_complex_double + module procedure prepare_matrix_frank_real_double +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure prepare_matrix_frank_real_single +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure prepare_matrix_frank_complex_single +#endif + end interface + + + + private prows, pcols, map_global_array_index_to_local_index + + contains + +#include "../../src/general/prow_pcol.F90" +#include "../../src/general/map_global_to_local.F90" + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include 
"test_prepare_matrix_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_prepare_matrix_template.F90" +#undef SINGLE_PRECISION +#undef COMPLEXCASE +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_prepare_matrix_template.F90" +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL + + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_prepare_matrix_template.F90" +#undef SINGLE_PRECISION +#undef REALCASE + +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +end module diff -Nru elpa-2016.05.001/test/shared/test_prepare_matrix_template.F90 elpa-2019.11.001/test/shared/test_prepare_matrix_template.F90 --- elpa-2016.05.001/test/shared/test_prepare_matrix_template.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_prepare_matrix_template.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,510 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! 
ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! Author: A. Marek, MPCDF + +#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#define TEST_C_INT_TYPE_PTR long int* +#define TEST_C_INT_TYPE long int +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#define TEST_C_INT_TYPE_PTR int* +#define TEST_C_INT_TYPE int +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#define TEST_C_INT_MPI_TYPE_PTR long int* +#define TEST_C_INT_MPI_TYPE long int +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#define TEST_C_INT_MPI_TYPE_PTR int* +#define TEST_C_INT_MPI_TYPE int +#endif + + + subroutine prepare_matrix_random_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, myid, sc_desc, a, z, as, is_skewsymmetric) + + + !use test_util + use tests_scalapack_interfaces + + implicit none +#include "./test_precision_kinds.F90" + 
TEST_INT_TYPE, intent(in) :: myid, na, sc_desc(:) + MATH_DATATYPE(kind=rck), intent(inout) :: z(:,:), a(:,:), as(:,:) + +#if COMPLEXCASE == 1 + real(kind=rk) :: xr(size(a,dim=1), size(a,dim=2)) +#endif /* COMPLEXCASE */ + + integer(kind=c_int), allocatable :: iseed(:) + integer(kind=c_int) :: n + integer(kind=c_int), intent(in), optional :: is_skewsymmetric + logical :: skewsymmetric + + if (present(is_skewsymmetric)) then + if (is_skewsymmetric .eq. 1) then + skewsymmetric = .true. + else + skewsymmetric = .false. + endif + else + skewsymmetric = .false. + endif + + ! for getting a hermitian test matrix A we get a random matrix Z + ! and calculate A = Z + Z**H + ! in case of a skewsymmetric matrix A = Z - Z**H + + ! we want different random numbers on every process + ! (otherwise A might get rank deficient): + + call random_seed(size=n) + allocate(iseed(n)) + iseed(:) = myid + call random_seed(put=iseed) +#if REALCASE == 1 + call random_number(z) + + a(:,:) = z(:,:) +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 + call random_number(xr) + + z(:,:) = xr(:,:) + call RANDOM_NUMBER(xr) + z(:,:) = z(:,:) + (0.0_rk,1.0_rk)*xr(:,:) + a(:,:) = z(:,:) +#endif /* COMPLEXCASE */ + + if (myid == 0) then + print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' + endif + +#if REALCASE == 1 +#ifdef WITH_MPI + if (skewsymmetric) then + call p& + &BLAS_CHAR& + &tran(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), -ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A + Z**T + else + call p& + &BLAS_CHAR& + &tran(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! 
A = A + Z**T + endif +#else /* WITH_MPI */ + if (skewsymmetric) then + a = a - transpose(z) + else + a = a + transpose(z) + endif +#endif /* WITH_MPI */ +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef WITH_MPI + if (skewsymmetric) then + call p& + &BLAS_CHAR& + &tranc(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), -ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A + Z**H + else + call p& + &BLAS_CHAR& + &tranc(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, & + ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A + Z**H + endif +#else /* WITH_MPI */ + if (skewsymmetric) then + a = a - transpose(conjg(z)) + else + a = a + transpose(conjg(z)) + endif +#endif /* WITH_MPI */ +#endif /* COMPLEXCASE */ + + + if (myid == 0) then + print '(a)','| Random matrix block has been symmetrized' + endif + + ! save original matrix A for later accuracy checks + + as = a + + deallocate(iseed) + + end subroutine + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + !c> void prepare_matrix_random_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> double *a, double *z, double *as); +#else + !c> void prepare_matrix_random_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> float *a, float *z, float *as); +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void prepare_matrix_random_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> complex double *a, complex double *z, complex double *as); +#else + !c> void prepare_matrix_random_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + 
!c> complex float *a, complex float *z, complex float *as); +#endif +#endif /* COMPLEXCASE */ + +subroutine prepare_matrix_random_& +&MATH_DATATYPE& +&_wrapper_& +&PRECISION& +& (na, myid, na_rows, na_cols, sc_desc, a, z, as) & + bind(C, name="prepare_matrix_random_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_f") + use iso_c_binding + + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE , value :: myid, na, na_rows, na_cols + TEST_INT_TYPE :: sc_desc(1:9) + MATH_DATATYPE(kind=rck) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & + as(1:na_rows,1:na_cols) + call prepare_matrix_random_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, myid, sc_desc, a, z, as) + end subroutine + +!---------------------------------------------------------------------------------------------------------------- + + subroutine prepare_matrix_random_spd_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, myid, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) + + !use test_util + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + TEST_INT_TYPE, intent(in) :: myid, na, sc_desc(:) + MATH_DATATYPE(kind=rck), intent(inout) :: z(:,:), a(:,:), as(:,:) + TEST_INT_TYPE, intent(in) :: nblk, np_rows, np_cols, my_prow, my_pcol + + TEST_INT_TYPE :: ii + integer(kind=c_int) :: rowLocal, colLocal + + + call prepare_matrix_random_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, myid, sc_desc, a, z, as) + + ! 
hermitian diagonaly dominant matrix => positive definite + do ii=1, na + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), & + rowLocal, colLocal, & + int(nblk,kind=c_int), int(np_rows,kind=c_int), & + int(np_cols,kind=c_int), int(my_prow,kind=c_int), & + int(my_pcol,kind=c_int) )) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = & + real(a(int(rowLocal,kind=INT_TYPE), int(colLocal,kind=INT_TYPE))) + na + 1 + end if + end do + + as = a + + end subroutine + +#if REALCASE == 1 +#ifdef DOUBLE_PRECISION_REAL + !c> void prepare_matrix_random_spd_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> double *a, double *z, double *as, + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#else + !c> void prepare_matrix_random_spd_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> float *a, float *z, float *as, + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, + !c> TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#endif +#endif /* REALCASE */ + +#if COMPLEXCASE == 1 +#ifdef DOUBLE_PRECISION_COMPLEX + !c> void prepare_matrix_random_spd_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> complex double *a, complex double *z, complex double *as, + !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#else + !c> void prepare_matrix_random_spd_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, + !c> TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9], + !c> complex float *a, complex float *z, complex float *as, + !c> 
TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, + !c> TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol); +#endif +#endif /* COMPLEXCASE */ + +subroutine prepare_matrix_random_spd_& +&MATH_DATATYPE& +&_wrapper_& +&PRECISION& +& (na, myid, na_rows, na_cols, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) & + bind(C, name="prepare_matrix_random_spd_& + &MATH_DATATYPE& + &_& + &PRECISION& + &_f") + use iso_c_binding + + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE , value :: myid, na, na_rows, na_cols + TEST_INT_TYPE :: sc_desc(1:9) + MATH_DATATYPE(kind=rck) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & + as(1:na_rows,1:na_cols) + TEST_INT_TYPE , value :: nblk, np_rows, np_cols, my_prow, my_pcol + call prepare_matrix_random_spd_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, myid, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) + end subroutine + + +!---------------------------------------------------------------------------------------------------------------- + + subroutine prepare_matrix_toeplitz_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, a, as, & + nblk, np_rows, np_cols, my_prow, my_pcol) + !use test_util + use precision_for_tests + implicit none +#include "./test_precision_kinds.F90" + + TEST_INT_TYPE, intent(in) :: na, nblk, np_rows, np_cols, my_prow, my_pcol + MATH_DATATYPE(kind=rck) :: diagonalElement, subdiagonalElement + MATH_DATATYPE(kind=rck) :: d(:), sd(:), ds(:), sds(:) + MATH_DATATYPE(kind=rck) :: a(:,:), as(:,:) + + TEST_INT_TYPE :: ii + integer(kind=c_int) :: rowLocal, colLocal + + d(:) = diagonalElement + sd(:) = subdiagonalElement + a(:,:) = ZERO + + ! set up the diagonal and subdiagonals (for general solver test) + do ii=1, na ! 
for diagonal elements + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), & + int(np_cols,kind=c_int), int(my_prow,kind=c_int), & + int(my_pcol,kind=c_int) ) ) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = diagonalElement + endif + enddo + do ii=1, na-1 + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii+1,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), & + int(np_cols,kind=c_int), int(my_prow,kind=c_int), & + int(my_pcol,kind=c_int) ) ) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement + endif + enddo + + do ii=2, na + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii-1,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), & + int(np_cols,kind=c_int), int(my_prow,kind=c_int), & + int(my_pcol,kind=c_int) ) ) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement + endif + enddo + + ds = d + sds = sd + as = a + end subroutine + + subroutine prepare_matrix_toeplitz_mixed_complex& + &_& + &MATH_DATATYPE& + &_& + &PRECISION& +#if COMPLEXCASE == 1 + & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, a, as, & + nblk, np_rows, np_cols, my_prow, my_pcol) +#endif +#if REALCASE == 1 + & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, & + nblk, np_rows, np_cols, my_prow, my_pcol) +#endif + !use test_util + implicit none + + TEST_INT_TYPE, intent(in) :: na, nblk, np_rows, np_cols, my_prow, my_pcol + real(kind=C_DATATYPE_KIND) :: diagonalElement, subdiagonalElement + + real(kind=C_DATATYPE_KIND) :: d(:), sd(:), ds(:), sds(:) + +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: a(:,:), as(:,:) +#endif +#if REALCASE == 1 +#endif + + TEST_INT_TYPE :: ii + integer(kind=c_int) :: rowLocal, colLocal +#if COMPLEXCASE == 1 + d(:) = diagonalElement + sd(:) = 
subdiagonalElement + + ! set up the diagonal and subdiagonals (for general solver test) + do ii=1, na ! for diagonal elements + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), & + int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = diagonalElement + endif + enddo + do ii=1, na-1 + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii+1,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), & + int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement + endif + enddo + + do ii=2, na + if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii-1,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), & + int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement + endif + enddo + + ds = d + sds = sd + as = a +#endif + end subroutine + + subroutine prepare_matrix_frank_& + &MATH_DATATYPE& + &_& + &PRECISION& + & (na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) + !use test_util + use precision_for_tests + implicit none + + TEST_INT_TYPE, intent(in) :: na, nblk, np_rows, np_cols, my_prow, my_pcol + +#if REALCASE == 1 + real(kind=C_DATATYPE_KIND) :: a(:,:), z(:,:), as(:,:) +#endif +#if COMPLEXCASE == 1 + complex(kind=C_DATATYPE_KIND) :: a(:,:), z(:,:), as(:,:) +#endif + + TEST_INT_TYPE :: i, j + integer(kind=c_int) :: rowLocal, colLocal + + do i = 1, na + do j = 1, na + if (map_global_array_index_to_local_index(int(i,kind=c_int), int(j,kind=c_int), rowLocal, & + colLocal, int(nblk,kind=c_int), & + int(np_rows,kind=c_int), int(np_cols,kind=c_int), & + 
int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then + if (j .le. i) then + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = & + real((na+1-i), kind=C_DATATYPE_KIND) / real(na, kind=C_DATATYPE_KIND) + else + a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = & + real((na+1-j), kind=C_DATATYPE_KIND) / real(na, kind=C_DATATYPE_KIND) + endif + endif + enddo + enddo + + z(:,:) = a(:,:) + as(:,:) = a(:,:) + + end subroutine + + +! vim: syntax=fortran diff -Nru elpa-2016.05.001/test/shared/test_read_input_parameters.F90 elpa-2019.11.001/test/shared/test_read_input_parameters.F90 --- elpa-2016.05.001/test/shared/test_read_input_parameters.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_read_input_parameters.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,455 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! 
GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +#include "config-f90.h" + + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif + +module test_read_input_parameters + use elpa, only : ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_2STAGE_REAL_DEFAULT, elpa_int_string_to_value, & + elpa_int_value_to_string, ELPA_OK + use elpa_utilities, only : error_unit + use iso_c_binding + use test_util, only : x_ao, x_a + use test_output_type + + implicit none + + type input_options_t + TEST_INT_TYPE :: datatype + TEST_INT_TYPE :: na, nev, nblk + type(output_t) :: write_to_file + TEST_INT_TYPE :: this_real_kernel, this_complex_kernel + logical :: realKernelIsSet, complexKernelIsSet + TEST_INT_TYPE :: useQrIsSet, useGPUIsSet + logical :: doSolveTridi, do1stage, do2stage, justHelpMessage, & + doCholesky, doInvertTrm, doTransposeMultiply + end type + + interface read_input_parameters + module procedure read_input_parameters_general + module procedure read_input_parameters_traditional + module procedure read_input_parameters_traditional_noskip + end interface + + contains + + subroutine parse_arguments(command_line_argument, input_options) + implicit none + + type(input_options_t) :: input_options + character(len=128) :: command_line_argument + integer(kind=c_int) :: elpa_error + + if (command_line_argument == "--help") then + 
print *,"usage: elpa_tests [--help] [datatype={real|complex}] [na=number] [nev=number] " + print *," [nblk=size of block cyclic distribution] [--output_eigenvalues]" + print *," [--output_eigenvectors] [--real-kernel=name_of_kernel]" + print *," [--complex-kernel=name_of_kernel] [--use-gpu={0|1}]" + print *," [--use-qr={0,1}] [--tests={all|solve-tridi|1stage|2stage|cholesky& + &|invert-triangular|transpose-mulitply}]" + input_options%justHelpMessage=.true. + return + endif + + + if (command_line_argument(1:11) == "--datatype=") then + if (command_line_argument(12:15) == "real") then + input_options%datatype=1 + else + if (command_line_argument(12:18) == "complex") then + input_options%datatype=2 + else + print *,"datatype unknown! use either --datatype=real or --datatpye=complex" + stop 1 + endif + endif + endif + + if (command_line_argument(1:3) == "na=") then + read(command_line_argument(4:), *) input_options%na + endif + if (command_line_argument(1:4) == "nev=") then + read(command_line_argument(5:), *) input_options%nev + endif + if (command_line_argument(1:5) == "nblk=") then + read(command_line_argument(6:), *) input_options%nblk + endif + + if (command_line_argument(1:21) == "--output_eigenvectors") then + input_options%write_to_file%eigenvectors = .true. + endif + + if (command_line_argument(1:20) == "--output_eigenvalues") then + input_options%write_to_file%eigenvalues = .true. + endif + + if (command_line_argument(1:14) == "--real-kernel=") then + input_options%this_real_kernel = int(elpa_int_string_to_value("real_kernel", & + command_line_argument(15:), elpa_error), & + kind=INT_TYPE) + if (elpa_error /= ELPA_OK) then + print *, "Invalid argument for --real-kernel" + stop 1 + endif + print *,"Setting ELPA2 real kernel to ", elpa_int_value_to_string("real_kernel", & + int(input_options%this_real_kernel,kind=c_int)) + input_options%realKernelIsSet = .true. 
+ endif + + if (command_line_argument(1:17) == "--complex-kernel=") then + input_options%this_complex_kernel = int(elpa_int_string_to_value("complex_kernel", & + command_line_argument(18:), elpa_error), kind=INT_TYPE) + if (elpa_error /= ELPA_OK) then + print *, "Invalid argument for --complex-kernel" + stop 1 + endif + print *,"Setting ELPA2 complex kernel to ", elpa_int_value_to_string("complex_kernel", & + int(input_options%this_complex_kernel,kind=c_int)) + input_options%complexKernelIsSet = .true. + endif + + if (command_line_argument(1:9) == "--use-qr=") then + read(command_line_argument(10:), *) input_options%useQrIsSet + endif + + if (command_line_argument(1:10) == "--use-gpu=") then + read(command_line_argument(11:), *) input_options%useGPUIsSet + endif + + if (command_line_argument(1:8) == "--tests=") then + if (command_line_argument(9:11) == "all") then + input_options%doSolveTridi=.true. + input_options%do1stage=.true. + input_options%do2stage=.true. + input_options%doCholesky=.true. + input_options%doInvertTrm=.true. + input_options%doTransposeMultiply=.true. + else if (command_line_argument(9:19) == "solve-tride") then + input_options%doSolveTridi=.true. + input_options%do1stage=.false. + input_options%do2stage=.false. + input_options%doCholesky=.false. + input_options%doInvertTrm=.false. + input_options%doTransposeMultiply=.false. + else if (command_line_argument(9:14) == "1stage") then + input_options%doSolveTridi=.false. + input_options%do1stage=.true. + input_options%do2stage=.false. + input_options%doCholesky=.false. + input_options%doInvertTrm=.false. + input_options%doTransposeMultiply=.false. + else if (command_line_argument(9:14) == "2stage") then + input_options%doSolveTridi=.false. + input_options%do1stage=.false. + input_options%do2stage=.true. + input_options%doCholesky=.false. + input_options%doInvertTrm=.false. + input_options%doTransposeMultiply=.false. 
+ else if (command_line_argument(9:16) == "cholesky") then + input_options%doSolveTridi=.false. + input_options%do1stage=.false. + input_options%do2stage=.false. + input_options%doCholesky=.true. + input_options%doInvertTrm=.false. + input_options%doTransposeMultiply=.false. + else if (command_line_argument(9:25) == "invert-triangular") then + input_options%doSolveTridi=.false. + input_options%do1stage=.false. + input_options%do2stage=.false. + input_options%doCholesky=.false. + input_options%doInvertTrm=.true. + input_options%doTransposeMultiply=.false. + else if (command_line_argument(9:26) == "transpose-multiply") then + input_options%doSolveTridi=.false. + input_options%do1stage=.false. + input_options%do2stage=.false. + input_options%doCholesky=.false. + input_options%doInvertTrm=.false. + input_options%doTransposeMultiply=.true. + else + print *,"unknown test specified" + stop 1 + endif + endif + + end subroutine + + subroutine read_input_parameters_general(input_options) + use precision_for_tests + implicit none + + type(input_options_t) :: input_options + + ! Command line arguments + character(len=128) :: arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10 + + ! default parameters + input_options%datatype = 1 + input_options%na = 500 + input_options%nev = 150 + input_options%nblk = 16 + + input_options%write_to_file%eigenvectors = .false. + input_options%write_to_file%eigenvalues = .false. + + input_options%this_real_kernel = ELPA_2STAGE_REAL_DEFAULT + input_options%this_complex_kernel = ELPA_2STAGE_COMPLEX_DEFAULT + input_options%realKernelIsSet = .false. + input_options%complexKernelIsSet = .false. + + input_options%useQrIsSet = 0 + + input_options%useGPUIsSet = 0 + + input_options%do1Stage = .true. + input_options%do2Stage = .true. + input_options%doSolveTridi = .true. + input_options%doCholesky=.true. + input_options%doInvertTrm=.true. + input_options%doTransposeMultiply=.true. + input_options%justHelpMessage=.false. + + ! 
test na=1500 nev=50 nblk=16 --help --kernel --output_eigenvectors --output_eigenvalues + if (COMMAND_ARGUMENT_COUNT() .gt. 8) then + write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!" + stop 1 + endif + + if (COMMAND_ARGUMENT_COUNT() .gt. 0) then + + call get_COMMAND_ARGUMENT(1, arg1) + + call parse_arguments(arg1, input_options) + + + + if (COMMAND_ARGUMENT_COUNT() .ge. 2) then + ! argument 2 + call get_COMMAND_ARGUMENT(2, arg2) + + call parse_arguments(arg2, input_options) + endif + + ! argument 3 + if (COMMAND_ARGUMENT_COUNT() .ge. 3) then + + call get_COMMAND_ARGUMENT(3, arg3) + + call parse_arguments(arg3, input_options) + endif + + ! argument 4 + if (COMMAND_ARGUMENT_COUNT() .ge. 4) then + + call get_COMMAND_ARGUMENT(4, arg4) + + call parse_arguments(arg4, input_options) + + endif + + ! argument 5 + if (COMMAND_ARGUMENT_COUNT() .ge. 5) then + + call get_COMMAND_ARGUMENT(5, arg5) + + call parse_arguments(arg5, input_options) + endif + + ! argument 6 + if (COMMAND_ARGUMENT_COUNT() .ge. 6) then + + call get_COMMAND_ARGUMENT(6, arg6) + + call parse_arguments(arg6, input_options) + endif + + ! argument 7 + if (COMMAND_ARGUMENT_COUNT() .ge. 7) then + + call get_COMMAND_ARGUMENT(7, arg7) + + call parse_arguments(arg7, input_options) + + endif + + ! argument 8 + if (COMMAND_ARGUMENT_COUNT() .ge. 8) then + + call get_COMMAND_ARGUMENT(8, arg8) + + call parse_arguments(arg8, input_options) + + endif + + ! argument 9 + if (COMMAND_ARGUMENT_COUNT() .ge. 9) then + + call get_COMMAND_ARGUMENT(9, arg9) + + call parse_arguments(arg8, input_options) + + endif + + ! argument 10 + if (COMMAND_ARGUMENT_COUNT() .ge. 10) then + + call get_COMMAND_ARGUMENT(10, arg10) + + call parse_arguments(arg8, input_options) + + endif + + endif + + if (input_options%useQrIsSet .eq. 1 .and. input_options%datatype .eq. 
2) then + print *,"You cannot use QR-decomposition in complex case" + stop 1 + endif + + end subroutine + + subroutine read_input_parameters_traditional_noskip(na, nev, nblk, write_to_file) + use precision_for_tests + implicit none + + TEST_INT_TYPE, intent(out) :: na, nev, nblk + + type(output_t), intent(out) :: write_to_file + logical :: skip_check_correctness + + call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness) + end subroutine + + subroutine read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness) + use precision_for_tests + implicit none + + TEST_INT_TYPE, intent(out) :: na, nev, nblk + + type(output_t), intent(out) :: write_to_file + logical, intent(out) :: skip_check_correctness + + ! Command line arguments + character(len=128) :: arg1, arg2, arg3, arg4, arg5 + + ! default parameters + na = 5000 + nev = 150 + nblk = 16 + write_to_file%eigenvectors = .false. + write_to_file%eigenvalues = .false. + skip_check_correctness = .false. + + if (.not. any(COMMAND_ARGUMENT_COUNT() == [0, 3, 4, 5])) then + write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!" + write(error_unit, *) "Expected: program [ [matrix_size num_eigenvalues block_size] & + ""output_eigenvalues"" ""output_eigenvectors""]" + stop 1 + endif + + if (COMMAND_ARGUMENT_COUNT() == 3) then + call GET_COMMAND_ARGUMENT(1, arg1) + call GET_COMMAND_ARGUMENT(2, arg2) + call GET_COMMAND_ARGUMENT(3, arg3) + + read(arg1, *) na + read(arg2, *) nev + read(arg3, *) nblk + endif + + if (COMMAND_ARGUMENT_COUNT() == 4) then + call GET_COMMAND_ARGUMENT(1, arg1) + call GET_COMMAND_ARGUMENT(2, arg2) + call GET_COMMAND_ARGUMENT(3, arg3) + call GET_COMMAND_ARGUMENT(4, arg4) + read(arg1, *) na + read(arg2, *) nev + read(arg3, *) nblk + + if (arg4 .eq. "output_eigenvalues") then + write_to_file%eigenvalues = .true. + elseif (arg4 .eq. "skip_check_correctness") then + skip_check_correctness = .true. 
+ else + write(error_unit, *) & + "Invalid value for parameter 4. Must be ""output_eigenvalues"", ""skip_check_correctness"" or omitted" + stop 1 + endif + + endif + + if (COMMAND_ARGUMENT_COUNT() == 5) then + call GET_COMMAND_ARGUMENT(1, arg1) + call GET_COMMAND_ARGUMENT(2, arg2) + call GET_COMMAND_ARGUMENT(3, arg3) + call GET_COMMAND_ARGUMENT(4, arg4) + call GET_COMMAND_ARGUMENT(5, arg5) + read(arg1, *) na + read(arg2, *) nev + read(arg3, *) nblk + + if (arg4 .eq. "output_eigenvalues") then + write_to_file%eigenvalues = .true. + else + write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted" + stop 1 + endif + + if (arg5 .eq. "output_eigenvectors") then + write_to_file%eigenvectors = .true. + else + write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvectors"" or omitted" + stop 1 + endif + + endif + end subroutine + +end module diff -Nru elpa-2016.05.001/test/shared/test_redir.c elpa-2019.11.001/test/shared/test_redir.c --- elpa-2016.05.001/test/shared/test_redir.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_redir.c 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,125 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. 
Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. 
+// +// +// -------------------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +#include + +#define NAME_LENGTH 4096 +#define FILENAME "./mpi_stdout/std%3s_rank%04d.txt" + +FILE *tout, *terr; +void dup_filename(char *filename, int dupfd); +void dup_fd(int fd, int dupfd); + +int _mkdirifnotexists(const char *dir) { + struct stat s; + if (stat(dir, &s) != 0) { + if (errno == ENOENT) { + if (mkdir(dir, 0755) != 0) { + perror("mkdir"); + return 0; + } else { + return 1; + } + } else { + perror("stat()"); + return 0; + } + } else if (!S_ISDIR(s.st_mode)) { + fprintf(stderr, "\"%s\" does exist and is not a directory\n", dir); + return 0; + } else { + return 1; + } +} + +int create_directories(void) { + if (!_mkdirifnotexists("mpi_stdout")) return 0; + return 1; +} + +void redirect_stdout(int *myproc) { + char buf[NAME_LENGTH]; + + if (*myproc == 0) { + snprintf(buf, NAME_LENGTH, "tee " FILENAME, "out", *myproc); + tout = popen(buf, "w"); + dup_fd(fileno(tout), 1); + + snprintf(buf, NAME_LENGTH, "tee " FILENAME, "err", *myproc); + terr = popen(buf, "w"); + dup_fd(fileno(terr), 2); + } else { + snprintf(buf, NAME_LENGTH, FILENAME, "out", *myproc); + dup_filename(buf, 1); + + snprintf(buf, NAME_LENGTH, FILENAME, "err", *myproc); + dup_filename(buf, 2); + } + + return; +} + +/* Redirect file descriptor dupfd to file filename */ +void dup_filename(char *filename, int dupfd) { + int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if(fd < 0) { + perror("open()"); + exit(1); + } + dup_fd(fd, dupfd); +} + +/* Redirect file descriptor dupfd to file descriptor fd */ +void dup_fd(int fd, int dupfd) { + if(dup2(fd,dupfd) < 0) { + perror("dup2()"); + exit(1); + } +} diff -Nru elpa-2016.05.001/test/shared/test_redirect.F90 elpa-2019.11.001/test/shared/test_redirect.F90 --- elpa-2016.05.001/test/shared/test_redirect.F90 1970-01-01 00:00:00.000000000 +0000 +++ 
elpa-2019.11.001/test/shared/test_redirect.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,116 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" + +module test_redirect + use, intrinsic :: iso_c_binding + + implicit none + public + + logical :: use_redirect_stdout + + interface + integer(kind=C_INT) function create_directories_c() bind(C, name="create_directories") + use, intrinsic :: iso_c_binding + implicit none + end function + end interface + + interface + subroutine redirect_stdout_c(myproc) bind(C, name="redirect_stdout") + use, intrinsic :: iso_c_binding + implicit none + integer(kind=C_INT), intent(in) :: myproc + end subroutine + end interface + + contains +!> +!> This function is the Fortran driver for the +!> C program to create the redirect output +!> directory +!> +!> \param none +!> \result res integer indicates success or failure + function create_directories() result(res) + implicit none + integer(kind=C_INT) :: res + res = int(create_directories_c()) + end function +!> +!> This subroutine is the Fortran driver for the +!> redirection of stdout and stderr of each MPI +!> task +!> +!> \param myproc MPI task id + subroutine redirect_stdout(myproc) + use, intrinsic :: iso_c_binding + implicit none + integer(kind=C_INT), intent(in) :: myproc + call redirect_stdout_c(int(myproc, kind=C_INT)) + end subroutine +!> +!> This function checks, whether the environment variable +!> "REDIRECT_ELPA_TEST_OUTPUT" is set to "true". +!> Returns ".true." if variable is set, otherwise ".false." +!> This function only works if the during the build process +!> "HAVE_ENVIRONMENT_CHECKING" was tested successfully +!> +!> \param none +!> \return logical + function check_redirect_environment_variable() result(redirect) + implicit none + logical :: redirect + character(len=255) :: REDIRECT_VARIABLE + + redirect = .false. + +#if defined(HAVE_ENVIRONMENT_CHECKING) + call get_environment_variable("REDIRECT_ELPA_TEST_OUTPUT",REDIRECT_VARIABLE) +#endif + if (trim(REDIRECT_VARIABLE) .eq. "true") redirect = .true. 
+ + end function + +end module test_redirect diff -Nru elpa-2016.05.001/test/shared/test_scalapack.F90 elpa-2019.11.001/test/shared/test_scalapack.F90 --- elpa-2016.05.001/test/shared/test_scalapack.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_scalapack.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,111 @@ +! (c) Copyright Pavel Kus, 2017, MPCDF +! +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! 
any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. + +#include "../Fortran/assert.h" +#include "config-f90.h" + +module test_scalapack + use test_util + + interface solve_scalapack_all + module procedure solve_pdsyevd + module procedure solve_pzheevd +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure solve_pssyevd +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure solve_pcheevd +#endif + end interface + + interface solve_scalapack_part + module procedure solve_pdsyevr + module procedure solve_pzheevr +#ifdef WANT_SINGLE_PRECISION_REAL + module procedure solve_pssyevr +#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX + module procedure solve_pcheevr +#endif + end interface + +contains + +#define COMPLEXCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_scalapack_template.F90" +#undef DOUBLE_PRECISION +#undef COMPLEXCASE + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + +#define COMPLEXCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_scalapack_template.F90" +#undef SINGLE_PRECISION +#undef COMPLEXCASE + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ + +#define REALCASE 1 +#define DOUBLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_scalapack_template.F90" +#undef DOUBLE_PRECISION +#undef REALCASE + +#ifdef WANT_SINGLE_PRECISION_REAL + +#define REALCASE 1 +#define SINGLE_PRECISION 1 +#include "../../src/general/precision_macros.h" +#include "test_scalapack_template.F90" +#undef SINGLE_PRECISION +#undef REALCASE + +#endif /* WANT_SINGLE_PRECISION_REAL */ + + +end module diff -Nru elpa-2016.05.001/test/shared/test_setup_mpi.F90 elpa-2019.11.001/test/shared/test_setup_mpi.F90 --- elpa-2016.05.001/test/shared/test_setup_mpi.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_setup_mpi.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 
+1,115 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +module test_setup_mpi + + contains + + subroutine setup_mpi(myid, nprocs) + use test_util + use ELPA_utilities + use precision_for_tests + implicit none + + TEST_INT_MPI_TYPE :: mpierr + + TEST_INT_TYPE, intent(out) :: myid, nprocs + TEST_INT_MPI_TYPE :: myidMPI, nprocsMPI +#ifdef WITH_OPENMP + TEST_INT_MPI_TYPE :: required_mpi_thread_level, & + provided_mpi_thread_level +#endif + + +#ifdef WITH_MPI + +#ifndef WITH_OPENMP + call mpi_init(mpierr) +#else + required_mpi_thread_level = MPI_THREAD_MULTIPLE + + call mpi_init_thread(required_mpi_thread_level, & + provided_mpi_thread_level, mpierr) + + if (required_mpi_thread_level .ne. 
provided_mpi_thread_level) then + write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system" + write(error_unit,*) " only ", mpi_thread_level_name(provided_mpi_thread_level), " is available" + call MPI_FINALIZE(mpierr) + call exit(77) + endif +#endif + call mpi_comm_rank(mpi_comm_world, myidMPI, mpierr) + call mpi_comm_size(mpi_comm_world, nprocsMPI,mpierr) + + myid = int(myidMPI,kind=BLAS_KIND) + nprocs = int(nprocsMPI,kind=BLAS_KIND) + + if (nprocs <= 1) then + print *, "The test programs must be run with more than 1 task to ensure that usage with MPI is actually tested" + stop 1 + endif +#else + myid = 0 + nprocs = 1 +#endif + + end subroutine + + +end module diff -Nru elpa-2016.05.001/test/shared/tests_variable_definitions.F90 elpa-2019.11.001/test/shared/tests_variable_definitions.F90 --- elpa-2016.05.001/test/shared/tests_variable_definitions.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/tests_variable_definitions.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,65 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.rzg.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! 
ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! This file was written by A. Marek, MPC + +#include "config-f90.h" +module precision_for_tests + use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_FLOAT_COMPLEX, C_DOUBLE_COMPLEX, C_INT32_T, C_INT64_T, C_INT + + implicit none + integer, parameter :: rk8 = C_DOUBLE + integer, parameter :: rk4 = C_FLOAT + integer, parameter :: ck8 = C_DOUBLE_COMPLEX + integer, parameter :: ck4 = C_FLOAT_COMPLEX + integer, parameter :: ik = C_INT32_T + integer, parameter :: lik = C_INT64_T + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT + integer, parameter :: BLAS_KIND = C_INT64_T +#else + integer, parameter :: BLAS_KIND = C_INT32_T +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT + integer, parameter :: MPI_KIND = C_INT64_T +#else + integer, parameter :: MPI_KIND = C_INT32_T +#endif +end module precision_for_tests diff -Nru elpa-2016.05.001/test/shared/test_util.F90 elpa-2019.11.001/test/shared/test_util.F90 --- elpa-2016.05.001/test/shared/test_util.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test/shared/test_util.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,156 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" +#undef TEST_INT_TYPE +#undef INT_TYPE +#undef TEST_INT_MPI_TYPE +#undef INT_MPI_TYPE + +#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT +#define TEST_INT_TYPE integer(kind=c_int64_t) +#define INT_TYPE c_int64_t +#else +#define TEST_INT_TYPE integer(kind=c_int32_t) +#define INT_TYPE c_int32_t +#endif +#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT +#define TEST_INT_MPI_TYPE integer(kind=c_int64_t) +#define INT_MPI_TYPE c_int64_t +#else +#define TEST_INT_MPI_TYPE integer(kind=c_int32_t) +#define INT_MPI_TYPE c_int32_t +#endif + +module test_util + use iso_c_binding + use precision_for_tests +#ifdef WITH_MPI +#ifdef HAVE_MPI_MODULE + use mpi + implicit none +#else + implicit none + include 'mpif.h' +#endif +#else + TEST_INT_MPI_TYPE, parameter :: mpi_comm_world = -1 +#endif + + contains +!> +!> This function translates, if ELPA was build with OpenMP support, +!> the found evel of "thread safetiness" from the internal number +!> of the MPI library into a human understandable value +!> +!> \param level thread-saftiness of the MPI library +!> \return str human understandable value of thread saftiness + pure function mpi_thread_level_name(level) result(str) + use, intrinsic :: iso_c_binding + implicit none + integer(kind=c_int), intent(in) :: level + character(len=21) :: str +#ifdef WITH_MPI + select case(level) + case (MPI_THREAD_SINGLE) + str = "MPI_THREAD_SINGLE" + case (MPI_THREAD_FUNNELED) + str = "MPI_THREAD_FUNNELED" + case (MPI_THREAD_SERIALIZED) + str = "MPI_THREAD_SERIALIZED" + case (MPI_THREAD_MULTIPLE) + str = "MPI_THREAD_MULTIPLE" + case default + write(str,'(i0,1x,a)') level, "(Unknown level)" + end select +#endif + end function + + function seconds() result(s) + integer :: ticks, tick_rate + real(kind=c_double) :: s + + call system_clock(count=ticks, count_rate=tick_rate) + s = real(ticks, kind=c_double) / tick_rate + end function + + subroutine x_a(condition, condition_string, file, line) +#ifdef HAVE_ISO_FORTRAN_ENV + use iso_fortran_env, only : 
error_unit +#endif + implicit none +#ifndef HAVE_ISO_FORTRAN_ENV + integer, parameter :: error_unit = 0 +#endif + logical, intent(in) :: condition + character(len=*), intent(in) :: condition_string + character(len=*), intent(in) :: file + integer, intent(in) :: line + + if (.not. condition) then + write(error_unit,'(a,i0)') "Assertion `" // condition_string // "` failed at " // file // ":", line + stop 1 + end if + end subroutine + + subroutine x_ao(error_code, error_code_string, file, line) + use elpa +#ifdef HAVE_ISO_FORTRAN_ENV + use iso_fortran_env, only : error_unit +#endif + implicit none +#ifndef HAVE_ISO_FORTRAN_ENV + integer, parameter :: error_unit = 0 +#endif + integer, intent(in) :: error_code + character(len=*), intent(in) :: error_code_string + character(len=*), intent(in) :: file + integer, intent(in) :: line + + if (error_code /= ELPA_OK) then + write(error_unit,'(a,i0)') "Assertion failed: `" // error_code_string // & + " is " // elpa_strerr(error_code) // "` at " // file // ":", line + stop 1 + end if + end subroutine +end module + diff -Nru elpa-2016.05.001/test/shared_sources/blacs_infrastructure.F90 elpa-2019.11.001/test/shared_sources/blacs_infrastructure.F90 --- elpa-2016.05.001/test/shared_sources/blacs_infrastructure.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/blacs_infrastructure.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,158 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! 
- Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-#include "config-f90.h" -module mod_blacs_infrastructure - - contains - - subroutine set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) - - use precision - - implicit none - integer(kind=ik), intent(in) :: mpi_comm_world - integer(kind=ik), intent(inout) :: my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol - - my_blacs_ctxt = mpi_comm_world -#ifdef WITH_MPI - call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols) - call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol) -#else - np_rows = 1 - np_cols = 1 - my_prow = 0 - my_pcol = 0 -#endif - end subroutine - - !c> void set_up_blacsgrid_from_fortran(int mpi_comm_world, int* my_blacs_ctxt, - !c> int *np_rows, int *np_cols, int *nprow, int *npcol, - !c> int *my_prow, int *my_pcol); - subroutine set_up_blacsgrid_wrapper(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) & - bind(C, name="set_up_blacsgrid_from_fortran") - use iso_c_binding - implicit none - integer(kind=c_int), value :: mpi_comm_world - integer(kind=c_int) :: my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol - - call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, & - np_cols, nprow, npcol, my_prow, my_pcol) - end subroutine - - subroutine set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, my_blacs_ctxt, info) - - use elpa_utilities, only : error_unit - use precision - use elpa_mpi - implicit none - - integer(kind=ik), intent(inout) :: na, nblk, my_prow, my_pcol, np_rows, & - np_cols, na_rows, na_cols, sc_desc(1:9), & - my_blacs_ctxt, info -#ifdef WITH_MPI - integer, external :: numroc - integer(kind=ik) :: mpierr - - ! determine the neccessary size of the distributed matrices, - ! we use the scalapack tools routine NUMROC - - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) - - ! 
set up the scalapack descriptor for the checks below - ! For ELPA the following restrictions hold: - ! - block sizes in both directions must be identical (args 4 a. 5) - ! - first row and column of the distributed matrix must be on - ! row/col 0/0 (arg 6 and 7) - - call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info) - - if (info .ne. 0) then - write(error_unit,*) 'Error in BLACS descinit! info=',info - write(error_unit,*) 'Most likely this happend since you want to use' - write(error_unit,*) 'more MPI tasks than are possible for your' - write(error_unit,*) 'problem size (matrix size and blocksize)!' - write(error_unit,*) 'The blacsgrid can not be set up properly' - write(error_unit,*) 'Try reducing the number of MPI tasks...' - call MPI_ABORT(mpi_comm_world, 1, mpierr) - endif -#else - na_rows = na - na_cols = na -#endif - end subroutine - - !c> void set_up_blacs_descriptor_from_fortran(int na, int nblk, int my_prow, int my_pcol, - !c> int np_rows, int np_cols, - !c> int *na_rows, int *na_cols, - !c> int sc_desc[9], - !c> int my_blacs_ctxt, - !c> int *info); - subroutine set_up_blacs_descriptor_wrapper(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, & - my_blacs_ctxt, info) & - bind(C, name="set_up_blacs_descriptor_from_fortran") - - use iso_c_binding - implicit none - - - integer(kind=c_int), value :: na, nblk, my_prow, my_pcol, np_rows, & - np_cols, my_blacs_ctxt - integer(kind=c_int) :: na_rows, na_cols, info, sc_desc(1:9) - - call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & - np_rows, np_cols, na_rows, & - na_cols, sc_desc, my_blacs_ctxt, info) - - - end subroutine - -end module diff -Nru elpa-2016.05.001/test/shared_sources/call_elpa1.c elpa-2019.11.001/test/shared_sources/call_elpa1.c --- elpa-2016.05.001/test/shared_sources/call_elpa1.c 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/call_elpa1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ -/* This 
file is part of ELPA. */ -/* */ -/* The ELPA library was originally created by the ELPA consortium, */ -/* consisting of the following organizations: */ -/* */ -/* - Max Planck Computing and Data Facility (MPCDF), formerly known as */ -/* Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), */ -/* - Bergische Universität Wuppertal, Lehrstuhl für angewandte */ -/* Informatik, */ -/* - Technische Universität München, Lehrstuhl für Informatik mit */ -/* Schwerpunkt Wissenschaftliches Rechnen , */ -/* - Fritz-Haber-Institut, Berlin, Abt. Theorie, */ -/* - Max-Plack-Institut für Mathematik in den Naturwissenschaften, */ -/* Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, */ -/* and */ -/* - IBM Deutschland GmbH */ -/* */ -/* */ -/* More information can be found here: */ -/* http://elpa.mpcdf.mpg.de/ */ -/* */ -/* ELPA is free software: you can redistribute it and/or modify */ -/* it under the terms of the version 3 of the license of the */ -/* GNU Lesser General Public License as published by the Free */ -/* Software Foundation. */ -/* */ -/* ELPA is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public License */ -/* along with ELPA. If not, see */ -/* */ -/* ELPA reflects a substantial effort on the part of the original */ -/* ELPA consortium, and we ask you to respect the spirit of the */ -/* license that we chose: i.e., please contribute any changes you */ -/* may have back to the original ELPA library distribution, and keep */ -/* any derivatives of ELPA under the same license that we chose for */ -/* the original distribution, the GNU Lesser General Public License. 
*/ -/* */ -/* */ -#include -#include -#include -#include -#include - -int call_elpa1_real_solver_from_c(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) { - return elpa_solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols); -} - -int call_elpa1_complex_solver_from_c(int na, int nev, complex double *a, int lda, double *ev, complex double *q, int ldq, int nblk, int ncols, int mpi_comm_rows, int mpi_comm_cols) { - return elpa_solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, ncols, mpi_comm_rows, mpi_comm_cols); -} - -int call_elpa_get_comm_from_c(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols) { - return elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols); -} diff -Nru elpa-2016.05.001/test/shared_sources/check_correctnes.F90 elpa-2019.11.001/test/shared_sources/check_correctnes.F90 --- elpa-2016.05.001/test/shared_sources/check_correctnes.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/check_correctnes.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,309 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! 
ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" - -module mod_check_correctness - - - interface check_correctness - module procedure check_correctness_complex - module procedure check_correctness_real - end interface - - contains - - function check_correctness_complex(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) - - use elpa_mpi - use precision - implicit none - integer(kind=ik) :: status - integer(kind=ik), intent(in) :: na, nev, myid - complex(kind=ck), intent(in) :: as(:,:), z(:,:) - complex(kind=ck), intent(inout) :: tmp1(:,:), tmp2(:,:) - real(kind=rk) :: ev(:) - complex(kind=ck) :: xc - integer(kind=ik) :: sc_desc(:), mpierr - complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) - integer(kind=ik) :: i - real(kind=rk) :: err, errmax -#ifndef WITH_MPI - complex(kind=ck) :: zdotc -#endif - - status = 0 - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - ! tmp1 = A * Z - ! 
as is original stored matrix, Z are the EVs -#ifdef WITH_MPI - call pzgemm('N','N',na,nev,na,CONE,as,1,1,sc_desc, & - z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc) -#else - call zgemm('N','N',na,nev,na,CONE,as,na,z,na,CZERO,tmp1,na) -#endif - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev - xc = ev(i) -#ifdef WITH_MPI - call pzscal(na,xc,tmp2,1,i,sc_desc,1) -#else - call zscal(na,xc,tmp2(1,i),1) -! tmp2(1:na,i) = xc*tmp2(1:na,i) -#endif - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0.0 - do i=1,nev - xc = 0 -#ifdef WITH_MPI - call pzdotc(na,xc,tmp1,1,i,sc_desc,1,tmp1,1,i,sc_desc,1) -#else - ! call zdotc(na,xc,1,tmp1,1) - xc = zdotc(na,tmp1,1,tmp1,1) -#endif - errmax = max(errmax, sqrt(real(xc,kind=rk))) - enddo - - ! Get maximum error norm over all processors - err = errmax -#ifdef WITH_MPI - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) -#else - errmax = err -#endif - if (myid==0) print * - if (myid==0) print *,'Error Residual :',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 -#ifdef WITH_MPI - call pzgemm('C','N',nev,nev,na,CONE,z,1,1,sc_desc, & - z,1,1,sc_desc,CZERO,tmp1,1,1,sc_desc) -#else - call zgemm('C','N',nev,nev,na,CONE,z,na,z,na,CZERO,tmp1,na) -#endif - ! Initialize tmp2 to unit matrix - tmp2 = 0 -#ifdef WITH_MPI - call pzlaset('A',nev,nev,CZERO,CONE,tmp2,1,1,sc_desc) -#else - call zlaset('A',nev,nev,CZERO,CONE,tmp2,na) -#endif - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) -#ifdef WITH_MPI - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) -#else - errmax = err -#endif - if (myid==0) print *,'Error Orthogonality:',errmax - - if (errmax .gt. 
5e-12) then - status = 1 - endif - end function - - function check_correctness_real(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) - - use elpa_mpi - use precision - implicit none - integer(kind=ik) :: status - integer(kind=ik), intent(in) :: na, nev, myid - real(kind=rk), intent(in) :: as(:,:), z(:,:) - real(kind=rk), intent(inout) :: tmp1(:,:), tmp2(:,:) - real(kind=rk) :: ev(:) - integer(kind=ik) :: sc_desc(:), mpierr - - integer(kind=ik) :: i - real(kind=rk) :: err, errmax -#ifndef WITH_MPI - real(kind=rk) :: dnrm2 -#endif - - status = 0 - - ! 1. Residual (maximum of || A*Zi - Zi*EVi ||) - ! tmp1 = A * Z -#ifdef WITH_MPI - call pdgemm('N','N',na,nev,na,1.d0,as,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) -#else - call dgemm('N','N',na,nev,na,1.d0,as,na,z,na,0.d0,tmp1,na) -#endif - - ! tmp2 = Zi*EVi - tmp2(:,:) = z(:,:) - do i=1,nev -#ifdef WITH_MPI - call pdscal(na,ev(i),tmp2,1,i,sc_desc,1) -#else - call dscal(na,ev(i),tmp2(:,i),1) -#endif - enddo - - ! tmp1 = A*Zi - Zi*EVi - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum norm of columns of tmp1 - errmax = 0.0 - do i=1,nev - err = 0.0 -#ifdef WITH_MPI - call pdnrm2(na,err,tmp1,1,i,sc_desc,1) -#else -! call dnrm2(na,err,tmp1,1,i,sc_desc,1) - err = dnrm2(na,tmp1(1,i),1) -#endif - errmax = max(errmax, err) - enddo - - ! Get maximum error norm over all processors - err = errmax - -#ifdef WITH_MPI - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) -#else - errmax = err -#endif - if (myid==0) print * - if (myid==0) print *,'Error Residual :',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - - ! 2. Eigenvector orthogonality - - ! tmp1 = Z**T * Z - tmp1 = 0 -#ifdef WITH_MPI - call pdgemm('T','N',nev,nev,na,1.d0,z,1,1,sc_desc, & - z,1,1,sc_desc,0.d0,tmp1,1,1,sc_desc) -#else - call dgemm('T','N',nev,nev,na,1.d0,z,na, & - z,na,0.d0,tmp1,na) -#endif - ! 
Initialize tmp2 to unit matrix - tmp2 = 0 -#ifdef WITH_MPI - call pdlaset('A',nev,nev,0.d0,1.d0,tmp2,1,1,sc_desc) -#else - call dlaset('A',nev,nev,0.d0,1.d0,tmp2,na) -#endif - ! tmp1 = Z**T * Z - Unit Matrix - tmp1(:,:) = tmp1(:,:) - tmp2(:,:) - - ! Get maximum error (max abs value in tmp1) - err = maxval(abs(tmp1)) -#ifdef WITH_MPI - call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr) -#else - errmax = err -#endif - if (myid==0) print *,'Error Orthogonality:',errmax - - if (errmax .gt. 5e-12) then - status = 1 - endif - end function - - !c> int check_correctness_real_from_fortran(int na, int nev, int na_rows, int na_cols, - !c> double *as, double *z, double *ev, - !c> int sc_desc[9], int myid, - !c> double *tmp1, double *tmp2); - function check_correctness_real_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) & - bind(C,name="check_correctness_real_from_fortran") - - use iso_c_binding - - implicit none - - integer(kind=c_int) :: status - integer(kind=c_int), value :: na, nev, myid, na_rows, na_cols - real(kind=c_double) :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols) - real(kind=c_double) :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols) - real(kind=c_double) :: ev(1:na) - integer(kind=c_int) :: sc_desc(1:9) - - status = check_correctness_real(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - end function - !c> int check_correctness_complex_from_fortran(int na, int nev, int na_rows, int na_cols, - !c> complex double *as, complex double *z, double *ev, - !c> int sc_desc[9], int myid, - !c> complex double *tmp1, complex double *tmp2); - function check_correctness_complex_wrapper(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid, tmp1, tmp2) result(status) & - bind(C,name="check_correctness_complex_from_fortran") - - use iso_c_binding - - implicit none - - integer(kind=c_int) :: status - integer(kind=c_int), value :: na, nev, myid, na_rows, na_cols - complex(kind=c_double) :: 
as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols) - complex(kind=c_double) :: tmp1(1:na_rows,1:na_cols), tmp2(1:na_rows,1:na_cols) - real(kind=c_double) :: ev(1:na) - integer(kind=c_int) :: sc_desc(1:9) - - status = check_correctness_complex(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2) - - end function - -end module mod_check_correctness diff -Nru elpa-2016.05.001/test/shared_sources/mod_from_c.F90 elpa-2019.11.001/test/shared_sources/mod_from_c.F90 --- elpa-2016.05.001/test/shared_sources/mod_from_c.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/mod_from_c.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,160 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -module from_c - implicit none - - public - - interface - integer(kind=c_int) function elpa1_real_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols ) & - bind(C, name="call_elpa1_real_solver_from_c") - - use iso_c_binding - implicit none - - integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=c_double) :: a(1:lda,1:matrixCOls), ev(1:na), q(1:ldq,1:matrixCols) - end function elpa1_real_c - - - end interface - - interface - integer(kind=c_int) function elpa1_complex_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCols, mpi_comm_rows, mpi_comm_cols ) & - bind(C, name="call_elpa1_complex_solver_from_c") - - use iso_c_binding - implicit none - - integer(kind=c_int), value :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=c_double) :: ev(1:na) - complex(kind=c_double) :: a(1:lda,1:matrixCOls), q(1:ldq,1:matrixCols) - - end function elpa1_complex_c - - - end interface - - interface - integer(kind=c_int) function elpa_get_comm_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) & - bind(C, name="call_elpa_get_comm_from_c") - use iso_c_binding - implicit none - integer(kind=c_int), value :: mpi_comm_world, my_prow, my_pcol - integer(kind=c_int) :: mpi_comm_rows, mpi_comm_cols - - end function - end interface - - contains - - function solve_elpa1_real_call_from_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCOls, mpi_comm_rows, mpi_comm_cols ) & - result(success) - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: na, 
nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - logical :: success - integer(kind=ik) :: successC - - real(kind=c_double) :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols) - - successC = elpa1_real_c(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successC .eq. 1) then - success = .true. - else - success = .false. - endif - - end function - - function solve_elpa1_complex_call_from_c(na, nev, a, lda, ev, q, ldq, & - nblk, matrixCOls, mpi_comm_rows, mpi_comm_cols ) & - result(success) - - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - logical :: success - integer(kind=ik) :: successC - - real(kind=c_double) :: ev(1:na) - complex(kind=c_double) :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols) - - - successC = elpa1_complex_c(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) - - if (successC .eq. 1) then - success = .true. - else - success = .false. 
- endif - - end function - - - function call_elpa_get_comm_from_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) result(mpierr) - - use precision - use iso_c_binding - implicit none - - integer(kind=ik) :: mpierr - integer(kind=ik) :: mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols - - mpierr = elpa_get_comm_c(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) - end function -end module from_c diff -Nru elpa-2016.05.001/test/shared_sources/mod_output_types.F90 elpa-2019.11.001/test/shared_sources/mod_output_types.F90 --- elpa-2016.05.001/test/shared_sources/mod_output_types.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/mod_output_types.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,11 +0,0 @@ -#include "config-f90.h" - -module output_types - - type :: output_t - logical :: eigenvectors - logical :: eigenvalues - end type - - -end module diff -Nru elpa-2016.05.001/test/shared_sources/prepare_matrix.F90 elpa-2019.11.001/test/shared_sources/prepare_matrix.F90 --- elpa-2016.05.001/test/shared_sources/prepare_matrix.F90 2016-05-20 05:09:52.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/prepare_matrix.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! 
More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -module mod_prepare_matrix - - interface prepare_matrix - module procedure prepare_matrix_complex - module procedure prepare_matrix_real - end interface - - contains - - subroutine prepare_matrix_complex(na, myid, sc_desc, iseed, xr, a, z, as) - - use precision - implicit none - - integer(kind=ik), intent(in) :: myid, na, sc_desc(:) - integer(kind=ik), intent(inout) :: iseed(:) - real(kind=rk), intent(inout) :: xr(:,:) - complex(kind=ck), intent(inout) :: z(:,:), a(:,:), as(:,:) - - complex(kind=ck), parameter :: CZERO = (0.d0, 0.d0), CONE = (1.d0, 0.d0) - - ! for getting a hermitian test matrix A we get a random matrix Z - ! and calculate A = Z + Z**H - - ! we want different random numbers on every process - ! 
(otherwise A might get rank deficient): - - iseed(:) = myid - call RANDOM_SEED(put=iseed) - call RANDOM_NUMBER(xr) - z(:,:) = xr(:,:) - call RANDOM_NUMBER(xr) - z(:,:) = z(:,:) + (0.d0,1.d0)*xr(:,:) - - a(:,:) = z(:,:) - - if (myid == 0) then - print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' - endif -#ifdef WITH_MPI - call pztranc(na, na, CONE, z, 1, 1, sc_desc, CONE, a, 1, 1, sc_desc) ! A = A + Z**H -#else - a = a + transpose(conjg(z)) -#endif - if (myid == 0) then - print '(a)','| Random matrix block has been symmetrized' - endif - - ! save original matrix A for later accuracy checks - - as = a - - end subroutine - - subroutine prepare_matrix_real(na, myid, sc_desc, iseed, a, z, as) - - use precision - implicit none - - integer(kind=ik), intent(in) :: myid, na, sc_desc(:) - integer(kind=ik), intent(inout) :: iseed(:) - real(kind=ck), intent(inout) :: z(:,:), a(:,:), as(:,:) - - ! for getting a hermitian test matrix A we get a random matrix Z - ! and calculate A = Z + Z**H - - ! we want different random numbers on every process - ! (otherwise A might get rank deficient): - - iseed(:) = myid - call RANDOM_SEED(put=iseed) - call RANDOM_NUMBER(z) - - a(:,:) = z(:,:) - - if (myid == 0) then - print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' - endif -#ifdef WITH_MPI - call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T -#else - a = a + transpose(z) -#endif - if (myid == 0) then - print '(a)','| Random matrix block has been symmetrized' - endif - - ! 
save original matrix A for later accuracy checks - - as = a - - end subroutine - - !c> void prepare_matrix_real_from_fortran(int na, int myid, int na_rows, int na_cols, - !c> int sc_desc[9], int iseed[4096], - !c> double *a, double *z, double *as); - subroutine prepare_matrix_real_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, a, z, as) & - bind(C, name="prepare_matrix_real_from_fortran") - use iso_c_binding - - implicit none - - integer(kind=c_int) , value :: myid, na, na_rows, na_cols - integer(kind=c_int) :: sc_desc(1:9) - integer(kind=c_int) :: iseed(1:4096) - real(kind=c_double) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & - as(1:na_rows,1:na_cols) - - call prepare_matrix_real(na, myid, sc_desc, iseed, a, z, as) - end subroutine - !c> void prepare_matrix_complex_from_fortran(int na, int myid, int na_rows, int na_cols, - !c> int sc_desc[9], int iseed[4096], double *xr, - !c> complex double *a, complex double *z, complex double *as); - subroutine prepare_matrix_complex_wrapper(na, myid, na_rows, na_cols, sc_desc, iseed, xr, a, z, as) & - bind(C, name="prepare_matrix_complex_from_fortran") - use iso_c_binding - - implicit none - - integer(kind=c_int) , value :: myid, na, na_rows, na_cols - integer(kind=c_int) :: sc_desc(1:9) - integer(kind=c_int) :: iseed(1:4096) - real(kind=c_double) :: xr(1:na_rows,1:na_cols) - complex(kind=c_double) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols), & - as(1:na_rows,1:na_cols) - - call prepare_matrix_complex(na, myid, sc_desc, iseed, xr, a, z, as) - end subroutine - -end module diff -Nru elpa-2016.05.001/test/shared_sources/read_input_parameters.F90 elpa-2019.11.001/test/shared_sources/read_input_parameters.F90 --- elpa-2016.05.001/test/shared_sources/read_input_parameters.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/read_input_parameters.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,132 +0,0 @@ -! This file is part of ELPA. -! -! 
The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-#include "config-f90.h" -module mod_read_input_parameters - - contains - - subroutine read_input_parameters(na, nev, nblk, write_to_file) - use ELPA_utilities, only : error_unit - use precision - use elpa_mpi - use output_types - implicit none - - integer(kind=ik), intent(out) :: na, nev, nblk - - type(output_t), intent(out) :: write_to_file - - ! Command line arguments - character(len=128) :: arg1, arg2, arg3, arg4, arg5 - integer(kind=ik) :: mpierr - - ! default parameters - na = 4000 - nev = 1500 - nblk = 16 - write_to_file%eigenvectors = .false. - write_to_file%eigenvalues = .false. - - if (.not. any(COMMAND_ARGUMENT_COUNT() == [0, 3, 4, 5])) then - write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!" - write(error_unit, *) "Expected: program [ [matrix_size num_eigenvalues block_size] & - ""output_eigenvalues"" ""output_eigenvectors""]" - stop 1 - endif - - if (COMMAND_ARGUMENT_COUNT() == 3) then - call GET_COMMAND_ARGUMENT(1, arg1) - call GET_COMMAND_ARGUMENT(2, arg2) - call GET_COMMAND_ARGUMENT(3, arg3) - - read(arg1, *) na - read(arg2, *) nev - read(arg3, *) nblk - endif - - if (COMMAND_ARGUMENT_COUNT() == 4) then - call GET_COMMAND_ARGUMENT(1, arg1) - call GET_COMMAND_ARGUMENT(2, arg2) - call GET_COMMAND_ARGUMENT(3, arg3) - call GET_COMMAND_ARGUMENT(4, arg4) - read(arg1, *) na - read(arg2, *) nev - read(arg3, *) nblk - - if (arg4 .eq. "output_eigenvalues") then - write_to_file%eigenvalues = .true. - else - write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted" - stop 1 - endif - - endif - - if (COMMAND_ARGUMENT_COUNT() == 5) then - call GET_COMMAND_ARGUMENT(1, arg1) - call GET_COMMAND_ARGUMENT(2, arg2) - call GET_COMMAND_ARGUMENT(3, arg3) - call GET_COMMAND_ARGUMENT(4, arg4) - call GET_COMMAND_ARGUMENT(5, arg5) - read(arg1, *) na - read(arg2, *) nev - read(arg3, *) nblk - - if (arg4 .eq. "output_eigenvalues") then - write_to_file%eigenvalues = .true. 
- else - write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted" - stop 1 - endif - - if (arg5 .eq. "output_eigenvectors") then - write_to_file%eigenvectors = .true. - else - write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvectors"" or omitted" - stop 1 - endif - - endif - end subroutine - -end module diff -Nru elpa-2016.05.001/test/shared_sources/redir.c elpa-2019.11.001/test/shared_sources/redir.c --- elpa-2016.05.001/test/shared_sources/redir.c 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/redir.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,125 +0,0 @@ -// This file is part of ELPA. -// -// The ELPA library was originally created by the ELPA consortium, -// consisting of the following organizations: -// -// - Max Planck Computing and Data Facility (MPCDF), formerly known as -// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -// - Bergische Universität Wuppertal, Lehrstuhl für angewandte -// Informatik, -// - Technische Universität München, Lehrstuhl für Informatik mit -// Schwerpunkt Wissenschaftliches Rechnen , -// - Fritz-Haber-Institut, Berlin, Abt. Theorie, -// - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -// and -// - IBM Deutschland GmbH -// -// -// More information can be found here: -// http://elpa.mpcdf.mpg.de/ -// -// ELPA is free software: you can redistribute it and/or modify -// it under the terms of the version 3 of the license of the -// GNU Lesser General Public License as published by the Free -// Software Foundation. -// -// ELPA is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public License -// along with ELPA. If not, see -// -// ELPA reflects a substantial effort on the part of the original -// ELPA consortium, and we ask you to respect the spirit of the -// license that we chose: i.e., please contribute any changes you -// may have back to the original ELPA library distribution, and keep -// any derivatives of ELPA under the same license that we chose for -// the original distribution, the GNU Lesser General Public License. -// -// -// -------------------------------------------------------------------------------------------------- -#include -#include -#include -#include -#include -#include -#include -#include - -#define NAME_LENGTH 4096 -#define FILENAME "./mpi_stdout/std%3s_rank%04d.txt" - -FILE *tout, *terr; -void dup_filename(char *filename, int dupfd); -void dup_fd(int fd, int dupfd); - -int _mkdirifnotexists(const char *dir) { - struct stat s; - if (stat(dir, &s) != 0) { - if (errno == ENOENT) { - if (mkdir(dir, 0755) != 0) { - perror("mkdir"); - return 0; - } else { - return 1; - } - } else { - perror("stat()"); - return 0; - } - } else if (!S_ISDIR(s.st_mode)) { - fprintf(stderr, "\"%s\" does exist and is not a directory\n", dir); - return 0; - } else { - return 1; - } -} - -int create_directories(void) { - if (!_mkdirifnotexists("mpi_stdout")) return 0; - return 1; -} - -void redirect_stdout(int *myproc) { - char buf[NAME_LENGTH]; - - if (*myproc == 0) { - snprintf(buf, NAME_LENGTH, "tee " FILENAME, "out", *myproc); - tout = popen(buf, "w"); - dup_fd(fileno(tout), 1); - - snprintf(buf, NAME_LENGTH, "tee " FILENAME, "err", *myproc); - terr = popen(buf, "w"); - dup_fd(fileno(terr), 2); - } else { - snprintf(buf, NAME_LENGTH, FILENAME, "out", *myproc); - dup_filename(buf, 1); - - snprintf(buf, NAME_LENGTH, FILENAME, "err", *myproc); - dup_filename(buf, 2); - } - - return; -} - -/* Redirect file descriptor dupfd to file filename */ -void dup_filename(char 
*filename, int dupfd) { - int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if(fd < 0) { - perror("open()"); - exit(1); - } - dup_fd(fd, dupfd); -} - -/* Redirect file descriptor dupfd to file descriptor fd */ -void dup_fd(int fd, int dupfd) { - if(dup2(fd,dupfd) < 0) { - perror("dup2()"); - exit(1); - } -} diff -Nru elpa-2016.05.001/test/shared_sources/redirect.F90 elpa-2019.11.001/test/shared_sources/redirect.F90 --- elpa-2016.05.001/test/shared_sources/redirect.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/redirect.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,118 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! 
ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" - -module redirect - use, intrinsic :: iso_c_binding - - implicit none - public - - logical :: use_redirect_stdout - - interface - integer(kind=C_INT) function create_directories_c() bind(C, name="create_directories") - use, intrinsic :: iso_c_binding - implicit none - end function - end interface - - interface - subroutine redirect_stdout_c(myproc) bind(C, name="redirect_stdout") - use, intrinsic :: iso_c_binding - implicit none - integer(kind=C_INT), intent(in) :: myproc - end subroutine - end interface - - contains -!> -!> This function is the Fortran driver for the -!> C program to create the redirect output -!> directory -!> -!> \param none -!> \result res integer indicates success or failure - function create_directories() result(res) - use precision - implicit none - integer(kind=ik) :: res - res = int(create_directories_c()) - end function -!> -!> This subroutine is the Fortran driver for the -!> redirection of stdout and stderr of each MPI -!> task -!> -!> \param myproc MPI task id - subroutine redirect_stdout(myproc) - use, intrinsic :: iso_c_binding - use precision - implicit none - integer(kind=ik), intent(in) :: myproc - call redirect_stdout_c(int(myproc, kind=C_INT)) - end subroutine -!> -!> This function checks, whether the environment variable -!> "REDIRECT_ELPA_TEST_OUTPUT" is set to "true". -!> Returns ".true." if variable is set, otherwise ".false." 
-!> This function only works if the during the build process -!> "HAVE_ENVIRONMENT_CHECKING" was tested successfully -!> -!> \param none -!> \return logical - function check_redirect_environment_variable() result(redirect) - implicit none - logical :: redirect - character(len=255) :: REDIRECT_VARIABLE - - redirect = .false. - -#if defined(HAVE_ENVIRONMENT_CHECKING) - call get_environment_variable("REDIRECT_ELPA_TEST_OUTPUT",REDIRECT_VARIABLE) -#endif - if (trim(REDIRECT_VARIABLE) .eq. "true") redirect = .true. - - end function - -end module redirect diff -Nru elpa-2016.05.001/test/shared_sources/setup_mpi.F90 elpa-2019.11.001/test/shared_sources/setup_mpi.F90 --- elpa-2016.05.001/test/shared_sources/setup_mpi.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/setup_mpi.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ -! This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! -#include "config-f90.h" -module mod_setup_mpi - - contains - - subroutine setup_mpi(myid, nprocs) - use test_util - use ELPA_utilities - use precision - use elpa_mpi - implicit none - - integer(kind=ik) :: mpierr - - integer(kind=ik), intent(out) :: myid, nprocs -#ifdef WITH_OPENMP - integer(kind=ik) :: required_mpi_thread_level, & - provided_mpi_thread_level - -#endif -#ifdef WITH_MPI - -#ifndef WITH_OPENMP - call mpi_init(mpierr) -#else - required_mpi_thread_level = MPI_THREAD_MULTIPLE - - call mpi_init_thread(required_mpi_thread_level, & - provided_mpi_thread_level, mpierr) - - if (required_mpi_thread_level .ne. provided_mpi_thread_level) then - write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system" - write(error_unit,*) " only ", mpi_thread_level_name(provided_mpi_thread_level), " is available" - call exit(77) - endif - -#endif -#endif /* WITH_MPI */ - call mpi_comm_rank(mpi_comm_world,myid,mpierr) - call mpi_comm_size(mpi_comm_world,nprocs,mpierr) - - end subroutine - - -end module mod_setup_mpi diff -Nru elpa-2016.05.001/test/shared_sources/util.F90 elpa-2019.11.001/test/shared_sources/util.F90 --- elpa-2016.05.001/test/shared_sources/util.F90 2016-05-19 18:28:03.000000000 +0000 +++ elpa-2019.11.001/test/shared_sources/util.F90 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -! 
This file is part of ELPA. -! -! The ELPA library was originally created by the ELPA consortium, -! consisting of the following organizations: -! -! - Max Planck Computing and Data Facility (MPCDF), formerly known as -! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), -! - Bergische Universität Wuppertal, Lehrstuhl für angewandte -! Informatik, -! - Technische Universität München, Lehrstuhl für Informatik mit -! Schwerpunkt Wissenschaftliches Rechnen , -! - Fritz-Haber-Institut, Berlin, Abt. Theorie, -! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, -! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, -! and -! - IBM Deutschland GmbH -! -! -! More information can be found here: -! http://elpa.mpcdf.mpg.de/ -! -! ELPA is free software: you can redistribute it and/or modify -! it under the terms of the version 3 of the license of the -! GNU Lesser General Public License as published by the Free -! Software Foundation. -! -! ELPA is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU Lesser General Public License for more details. -! -! You should have received a copy of the GNU Lesser General Public License -! along with ELPA. If not, see -! -! ELPA reflects a substantial effort on the part of the original -! ELPA consortium, and we ask you to respect the spirit of the -! license that we chose: i.e., please contribute any changes you -! may have back to the original ELPA library distribution, and keep -! any derivatives of ELPA under the same license that we chose for -! the original distribution, the GNU Lesser General Public License. -! -! 
-#include "config-f90.h" -module test_util - use elpa_mpi - implicit none - private - public mpi_thread_level_name - - contains -!> -!> This function translates, if ELPA was build with OpenMP support, -!> the found evel of "thread safetiness" from the internal number -!> of the MPI library into a human understandable value -!> -!> \param level thread-saftiness of the MPI library -!> \return str human understandable value of thread saftiness - pure function mpi_thread_level_name(level) result(str) - use precision - implicit none - integer(kind=ik), intent(in) :: level - character(len=21) :: str -#ifdef WITH_MPI - select case(level) - case (MPI_THREAD_SINGLE) - str = "MPI_THREAD_SINGLE" - case (MPI_THREAD_FUNNELED) - str = "MPI_THREAD_FUNNELED" - case (MPI_THREAD_SERIALIZED) - str = "MPI_THREAD_SERIALIZED" - case (MPI_THREAD_MULTIPLE) - str = "MPI_THREAD_MULTIPLE" - case default - write(str,'(i0,1x,a)') level, "(Unknown level)" - end select -#endif - end function - -end module diff -Nru elpa-2016.05.001/test-driver elpa-2019.11.001/test-driver --- elpa-2016.05.001/test-driver 2016-05-20 07:04:37.000000000 +0000 +++ elpa-2019.11.001/test-driver 2019-12-21 16:29:48.000000000 +0000 @@ -1,9 +1,9 @@ #! /bin/sh # test-driver - basic testsuite driver script. -scriptversion=2013-07-13.22; # UTC +scriptversion=2018-03-07.03; # UTC -# Copyright (C) 2011-2014 Free Software Foundation, Inc. +# Copyright (C) 2011-2018 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -16,7 +16,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -56,21 +56,26 @@ expect_failure=no color_tests=no enable_hard_errors=yes -while test $# -gt 0; do - case $1 in +while test $# -gt 1; do + arg=${1%=*} + val=${1#*=} + if [ $arg = $val ]; then + val=$2 + shift + fi + case $arg in --help) print_usage; exit $?;; --version) echo "test-driver $scriptversion"; exit $?;; - --test-name) test_name=$2; shift;; - --log-file) log_file=$2; shift;; - --trs-file) trs_file=$2; shift;; - --color-tests) color_tests=$2; shift;; - --expect-failure) expect_failure=$2; shift;; - --enable-hard-errors) enable_hard_errors=$2; shift;; - --) shift; break;; + --test-name) test_name=$val;; + --log-file) log_file=$val;; + --trs-file) trs_file=$val;; + --color-tests) color_tests=$val;; + --expect-failure) expect_failure=$val;; + --enable-hard-errors) enable_hard_errors=$val;; + --) break;; -*) usage_error "invalid option: '$1'";; - *) break;; esac - shift + [ $arg != $val ] && shift done missing_opts= @@ -140,9 +145,9 @@ # Local Variables: # mode: shell-script # sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" +# time-stamp-time-zone: "UTC0" # time-stamp-end: "; # UTC" # End: diff -Nru elpa-2016.05.001/test_programs.am elpa-2019.11.001/test_programs.am --- elpa-2016.05.001/test_programs.am 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_programs.am 2019-12-21 16:29:37.000000000 +0000 @@ -0,0 +1,7065 @@ +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_double_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_double_eigenvectors_1stage_random_default.sh 
+validate_c_version_complex_double_eigenvectors_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_default.sh +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_c_version_real_double_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_c_version_real_double_eigenvectors_1stage_random_default.sh +validate_c_version_real_double_eigenvectors_1stage_random_SOURCES = test/C/test.c +validate_c_version_real_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) 
+validate_c_version_real_double_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_default.sh +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_single_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_single_eigenvectors_1stage_random_default.sh +validate_c_version_complex_single_eigenvectors_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_1stage_random\" \ + 
-DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_default.sh +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_c_version_real_single_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_c_version_real_single_eigenvectors_1stage_random_default.sh +validate_c_version_real_single_eigenvectors_1stage_random_SOURCES = test/C/test.c +validate_c_version_real_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_eigenvectors_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ 
+ -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_default.sh +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/C/test.c +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_generalized_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_double_generalized_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_double_generalized_1stage_random_default.sh +validate_c_version_complex_double_generalized_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_double_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_generalized_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_generalized_1stage_random +else 
+noinst_PROGRAMS += validate_c_version_real_double_generalized_1stage_random +endif +check_SCRIPTS += validate_c_version_real_double_generalized_1stage_random_default.sh +validate_c_version_real_double_generalized_1stage_random_SOURCES = test/C/test.c +validate_c_version_real_double_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_generalized_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_generalized_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_single_generalized_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_single_generalized_1stage_random_default.sh +validate_c_version_complex_single_generalized_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_single_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_generalized_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_generalized_1stage_random +else +noinst_PROGRAMS += validate_c_version_real_single_generalized_1stage_random +endif +check_SCRIPTS += validate_c_version_real_single_generalized_1stage_random_default.sh +validate_c_version_real_single_generalized_1stage_random_SOURCES = test/C/test.c 
+validate_c_version_real_single_generalized_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_generalized_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_generalized_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_double_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_double_generalized_decomp_1stage_random_default.sh +validate_c_version_complex_double_generalized_decomp_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_generalized_decomp_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_c_version_real_double_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_c_version_real_double_generalized_decomp_1stage_random_default.sh +validate_c_version_real_double_generalized_decomp_1stage_random_SOURCES = test/C/test.c +validate_c_version_real_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ + 
-DTEST_CASE=\"validate_c_version_real_double_generalized_decomp_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_c_version_complex_single_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_c_version_complex_single_generalized_decomp_1stage_random_default.sh +validate_c_version_complex_single_generalized_decomp_1stage_random_SOURCES = test/C/test.c +validate_c_version_complex_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_generalized_decomp_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_c_version_real_single_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_c_version_real_single_generalized_decomp_1stage_random_default.sh +validate_c_version_real_single_generalized_decomp_1stage_random_SOURCES = test/C/test.c +validate_c_version_real_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_generalized_decomp_1stage_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_generalized_decomp_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + 
-DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_double_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_double_eigenvectors_1stage_gpu_random_default.sh +validate_c_version_complex_double_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + 
-DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_double_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_double_eigenvectors_1stage_gpu_random_default.sh +validate_c_version_real_double_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if 
WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_single_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_single_eigenvectors_1stage_gpu_random_default.sh +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif +endif +endif + +if ENABLE_C_TESTS +if 
WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_single_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_single_eigenvectors_1stage_gpu_random_default.sh +validate_c_version_real_single_eigenvectors_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_eigenvectors_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER 
+bin_PROGRAMS += validate_c_version_complex_double_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_double_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_double_generalized_1stage_gpu_random_default.sh +validate_c_version_complex_double_generalized_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_generalized_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_double_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_double_generalized_1stage_gpu_random_default.sh +validate_c_version_real_double_generalized_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_generalized_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_single_generalized_1stage_gpu_random +endif +check_SCRIPTS += 
validate_c_version_complex_single_generalized_1stage_gpu_random_default.sh +validate_c_version_complex_single_generalized_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_generalized_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_single_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_single_generalized_1stage_gpu_random_default.sh +validate_c_version_real_single_generalized_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_generalized_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_generalized_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_double_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_double_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_default.sh +validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c 
+validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_double_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_double_generalized_decomp_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_double_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_double_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_double_generalized_decomp_1stage_gpu_random_default.sh +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_double_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_double_generalized_decomp_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_complex_single_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_complex_single_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_default.sh +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_LDADD = 
$(test_program_ldadd) $(FCLIBS) +validate_c_version_complex_single_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_complex_single_generalized_decomp_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if ENABLE_C_TESTS +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_c_version_real_single_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_c_version_real_single_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_c_version_real_single_generalized_decomp_1stage_gpu_random_default.sh +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_SOURCES = test/C/test.c +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_c_version_real_single_generalized_decomp_1stage_gpu_random_CFLAGS = $(test_program_cflags) \ + -DTEST_CASE=\"validate_c_version_real_single_generalized_decomp_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_analytic_all_layouts_extended.sh +validate_complex_double_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_analytic +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_analytic +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_analytic_default.sh +validate_complex_double_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS \ + 
-DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_analytic_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_complex_double_eigenvectors_2stage_default_kernel_analytic +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_analytic +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_analytic_default.sh +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +if WITH_MPI +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_all_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_ALL \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_scalapack_all_analytic +else +noinst_PROGRAMS += 
validate_complex_double_eigenvectors_scalapack_all_analytic +endif +check_SCRIPTS += validate_complex_double_eigenvectors_scalapack_all_analytic_default.sh +validate_complex_double_eigenvectors_scalapack_all_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_scalapack_all_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_all_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_all_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_ALL \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_part_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_PART \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_scalapack_part_analytic +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_scalapack_part_analytic +endif +check_SCRIPTS += validate_complex_double_eigenvectors_scalapack_part_analytic_default.sh 
+validate_complex_double_eigenvectors_scalapack_part_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_scalapack_part_analytic_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_scalapack_part_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_scalapack_part_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_PART \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_analytic_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_analytic_all_layouts_extended.sh +validate_real_double_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_analytic +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_analytic +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_analytic_default.sh +validate_real_double_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + 
-DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_analytic +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_analytic +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_analytic_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_analytic +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_analytic +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_analytic_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts +else +noinst_PROGRAMS += 
validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_extended.sh +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_all_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_ALL \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_scalapack_all_analytic +else +noinst_PROGRAMS += validate_real_double_eigenvectors_scalapack_all_analytic +endif +check_SCRIPTS += validate_real_double_eigenvectors_scalapack_all_analytic_default.sh +validate_real_double_eigenvectors_scalapack_all_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_scalapack_all_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_all_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_all_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_ALL \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_extended.sh 
+validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_part_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_PART \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_SCALAPACK_TESTS +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_scalapack_part_analytic +else +noinst_PROGRAMS += validate_real_double_eigenvectors_scalapack_part_analytic +endif +check_SCRIPTS += validate_real_double_eigenvectors_scalapack_part_analytic_default.sh +validate_real_double_eigenvectors_scalapack_part_analytic_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_scalapack_part_analytic_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_scalapack_part_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_scalapack_part_analytic\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SCALAPACK_PART \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_analytic_all_layouts_extended.sh +validate_complex_single_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) 
+validate_complex_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_analytic +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_analytic +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_analytic_default.sh +validate_complex_single_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ + -DTEST_COMPLEX 
\ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_analytic_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_analytic\" \ + 
-DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_analytic_default.sh +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_analytic_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_analytic_all_layouts_extended.sh +validate_real_single_eigenvectors_1stage_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + 
-DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_analytic +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_analytic +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_analytic_default.sh +validate_real_single_eigenvectors_1stage_analytic_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_analytic_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_analytic\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +else +noinst_PROGRAMS += 
validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_analytic_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_analytic +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_analytic +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_analytic_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_analytic_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_analytic_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_analytic\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_analytic +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_analytic +endif +check_SCRIPTS += 
validate_real_single_eigenvectors_2stage_default_kernel_analytic_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_analytic_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_analytic_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_analytic_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_analytic\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_ANALYTIC \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_frank_all_layouts_extended.sh +validate_real_double_eigenvalues_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_frank +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_frank +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_frank_default.sh +validate_real_double_eigenvalues_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_double_eigenvalues_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_extended.sh +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_frank +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_frank +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_frank_default.sh +validate_real_double_eigenvalues_2stage_default_kernel_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + 
-DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_frank_default.sh +validate_real_double_eigenvectors_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_SOURCES = 
test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_frank_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_frank_LDADD = $(test_program_ldadd) 
+validate_real_double_eigenvectors_2stage_all_kernels_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_frank_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_frank_all_layouts_extended.sh +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + 
-DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_frank +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_frank +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_frank_default.sh +validate_real_double_hermitian_multiply_1stage_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_frank_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_extended.sh +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_frank +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_frank +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_gpu_frank_default.sh 
+validate_real_double_eigenvalues_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_gpu_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_extended.sh +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_default.sh +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_LDADD = 
$(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_frank_default.sh +validate_real_double_eigenvectors_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + 
-DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_KERNELS +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_extended.sh +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_frank_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_frank +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_frank +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_gpu_frank_default.sh +validate_real_double_hermitian_multiply_1stage_gpu_frank_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_gpu_frank_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_frank_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_frank\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_FRANK +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_random_all_layouts_extended.sh 
+validate_complex_double_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_random +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_random +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_random_default.sh +validate_complex_double_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_random_all_layouts_extended.sh +validate_real_double_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_random +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_random +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_random_default.sh +validate_real_double_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_random_split_comm_myself +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_random_split_comm_myself_default.sh +validate_real_double_cholesky_1stage_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DSPLIT_COMM_MYSELF +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_random_all_layouts_extended.sh +validate_complex_single_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_complex_single_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_random +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_random +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_random_default.sh +validate_complex_single_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_random_all_layouts_extended.sh +validate_real_single_cholesky_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_random +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_random +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_random_default.sh +validate_real_single_cholesky_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_random_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_random_default.sh +validate_complex_double_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 
+validate_complex_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) 
+validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_random_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_random_default.sh +validate_complex_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_COMPLEX \ + 
-DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_random_all_layouts_extended.sh +validate_real_double_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_random_default.sh +validate_real_double_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_random_split_comm_myself +endif +check_SCRIPTS += 
validate_real_double_eigenvectors_1stage_random_split_comm_myself_default.sh +validate_real_double_eigenvectors_1stage_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DSPLIT_COMM_MYSELF +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh 
+validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_random_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_random_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) 
+validate_real_double_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DSPLIT_COMM_MYSELF +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) 
+validate_complex_single_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_random_default.sh +validate_complex_single_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + 
-DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_random +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_random_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + 
-DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_random_default.sh +validate_complex_single_eigenvectors_2stage_default_kernel_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_random_all_layouts_extended.sh +validate_real_single_eigenvectors_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_real_single_eigenvectors_1stage_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_random_default.sh +validate_real_single_eigenvectors_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts +endif +check_SCRIPTS += 
validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_random_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_random_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_random_SOURCES = 
test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_generalized_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_generalized_1stage_random_all_layouts_default.sh +validate_complex_double_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_1stage_random +else +noinst_PROGRAMS += validate_complex_double_generalized_1stage_random +endif +check_SCRIPTS += validate_complex_double_generalized_1stage_random_default.sh +validate_complex_double_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + 
-DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_generalized_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_double_generalized_1stage_random_all_layouts_default.sh +validate_real_double_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_1stage_random +else +noinst_PROGRAMS += validate_real_double_generalized_1stage_random +endif +check_SCRIPTS += validate_real_double_generalized_1stage_random_default.sh +validate_real_double_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_generalized_1stage_random_all_layouts +endif +check_SCRIPTS += 
validate_complex_single_generalized_1stage_random_all_layouts_default.sh +validate_complex_single_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_1stage_random +else +noinst_PROGRAMS += validate_complex_single_generalized_1stage_random +endif +check_SCRIPTS += validate_complex_single_generalized_1stage_random_default.sh +validate_complex_single_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_generalized_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_single_generalized_1stage_random_all_layouts_default.sh +validate_real_single_generalized_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_generalized_1stage_random_all_layouts_FCFLAGS 
= $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_1stage_random +else +noinst_PROGRAMS += validate_real_single_generalized_1stage_random +endif +check_SCRIPTS += validate_real_single_generalized_1stage_random_default.sh +validate_real_single_generalized_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_1stage_random_LDADD = $(test_program_ldadd) +validate_real_single_generalized_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_decomp_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_generalized_decomp_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_generalized_decomp_1stage_random_all_layouts_extended.sh +validate_complex_double_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if 
BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_complex_double_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_complex_double_generalized_decomp_1stage_random_default.sh +validate_complex_double_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_decomp_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_generalized_decomp_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_double_generalized_decomp_1stage_random_all_layouts_extended.sh +validate_real_double_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_real_double_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_real_double_generalized_decomp_1stage_random_default.sh 
+validate_real_double_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_decomp_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_generalized_decomp_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_generalized_decomp_1stage_random_all_layouts_extended.sh +validate_complex_single_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_complex_single_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_complex_single_generalized_decomp_1stage_random_default.sh +validate_complex_single_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) 
+validate_complex_single_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_decomp_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_generalized_decomp_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_single_generalized_decomp_1stage_random_all_layouts_extended.sh +validate_real_single_generalized_decomp_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_decomp_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_decomp_1stage_random +else +noinst_PROGRAMS += validate_real_single_generalized_decomp_1stage_random +endif +check_SCRIPTS += validate_real_single_generalized_decomp_1stage_random_default.sh +validate_real_single_generalized_decomp_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_decomp_1stage_random_LDADD = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ 
+ -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_hermitian_multiply_1stage_random_all_layouts_extended.sh +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_random +else +noinst_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_random +endif +check_SCRIPTS += validate_complex_double_hermitian_multiply_1stage_random_default.sh +validate_complex_double_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_random_all_layouts +else +noinst_PROGRAMS += 
validate_real_double_hermitian_multiply_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_random_all_layouts_extended.sh +validate_real_double_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_random +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_random +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_random_default.sh +validate_real_double_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_hermitian_multiply_1stage_random_all_layouts_extended.sh +validate_complex_single_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_complex_single_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_random +else +noinst_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_random +endif +check_SCRIPTS += validate_complex_single_hermitian_multiply_1stage_random_default.sh +validate_complex_single_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_hermitian_multiply_1stage_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_hermitian_multiply_1stage_random_all_layouts +endif +check_SCRIPTS += validate_real_single_hermitian_multiply_1stage_random_all_layouts_extended.sh +validate_real_single_hermitian_multiply_1stage_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_hermitian_multiply_1stage_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_hermitian_multiply_1stage_random +else +noinst_PROGRAMS += validate_real_single_hermitian_multiply_1stage_random +endif +check_SCRIPTS += validate_real_single_hermitian_multiply_1stage_random_default.sh +validate_real_single_hermitian_multiply_1stage_random_SOURCES = test/Fortran/test.F90 +validate_real_single_hermitian_multiply_1stage_random_LDADD = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_qr_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + 
-DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_qr_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_qr_random_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_qr_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_real_double_eigenvectors_2stage_default_kernel_qr_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_qr_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_qr_random_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_qr_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_qr_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_qr_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_qr_random_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_qr_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_qr_random +else +noinst_PROGRAMS 
+= validate_real_single_eigenvectors_2stage_default_kernel_qr_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_qr_random_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_qr_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_qr_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=1 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_gpu_random_all_layouts_extended.sh +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_gpu_random_default.sh +validate_complex_double_cholesky_1stage_gpu_random_SOURCES = 
test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_gpu_random_all_layouts_extended.sh +validate_real_double_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_gpu_random_default.sh +validate_real_double_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_gpu_random_split_comm_myself +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_default.sh +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DSPLIT_COMM_MYSELF +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_gpu_random_all_layouts_extended.sh +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_complex_single_cholesky_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_gpu_random_default.sh +validate_complex_single_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_gpu_random_all_layouts_extended.sh +validate_real_single_cholesky_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_gpu_random +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_gpu_random_default.sh +validate_real_single_cholesky_1stage_gpu_random_SOURCES = test/Fortran/test.F90 
+validate_real_single_cholesky_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_gpu_random_default.sh +validate_complex_double_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + 
-DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ 
+ -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_random_default.sh +validate_real_double_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_MPI +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself +endif +check_SCRIPTS += 
validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_default.sh +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DSPLIT_COMM_MYSELF +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +endif +check_SCRIPTS += 
validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_default.sh 
+validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_MPI +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_random_split_comm_myself\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DSPLIT_COMM_MYSELF +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += 
validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_gpu_random_default.sh +validate_complex_single_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = 
test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random +endif +check_SCRIPTS += 
validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += 
validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_extended.sh +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_gpu_random_default.sh +validate_real_single_eigenvectors_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_extended.sh 
+validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_KERNELS +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_generalized_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_generalized_1stage_gpu_random_all_layouts_default.sh +validate_complex_double_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_complex_double_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_double_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_double_generalized_1stage_gpu_random_default.sh +validate_complex_double_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_generalized_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_generalized_1stage_gpu_random_all_layouts_default.sh +validate_real_double_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ 
+ -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_double_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_real_double_generalized_1stage_gpu_random_default.sh +validate_real_double_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_generalized_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_generalized_1stage_gpu_random_all_layouts_default.sh +validate_complex_single_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER 
+bin_PROGRAMS += validate_complex_single_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_single_generalized_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_single_generalized_1stage_gpu_random_default.sh +validate_complex_single_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_generalized_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_generalized_1stage_gpu_random_all_layouts_default.sh +validate_real_single_generalized_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_generalized_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_single_generalized_1stage_gpu_random +endif +check_SCRIPTS += 
validate_real_single_generalized_1stage_gpu_random_default.sh +validate_real_single_generalized_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_generalized_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_double_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_double_generalized_decomp_1stage_gpu_random_default.sh +validate_complex_double_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 
+validate_complex_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_generalized_decomp_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_double_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_real_double_generalized_decomp_1stage_gpu_random_default.sh +validate_real_double_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_double_generalized_decomp_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_single_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_single_generalized_decomp_1stage_gpu_random_default.sh +validate_complex_single_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_generalized_decomp_1stage_gpu_random\" \ + -DTEST_COMPLEX \ 
+ -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_extended.sh +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_generalized_decomp_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_single_generalized_decomp_1stage_gpu_random +endif +check_SCRIPTS += validate_real_single_generalized_decomp_1stage_gpu_random_default.sh +validate_real_single_generalized_decomp_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_generalized_decomp_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_generalized_decomp_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_generalized_decomp_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_GENERALIZED_DECOMP_EIGENPROBLEM \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + 
-DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_double_hermitian_multiply_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_double_hermitian_multiply_1stage_gpu_random_default.sh +validate_complex_double_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_double_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_double_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_hermitian_multiply_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts +else 
+noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_double_hermitian_multiply_1stage_gpu_random +endif +check_SCRIPTS += validate_real_double_hermitian_multiply_1stage_gpu_random_default.sh +validate_real_double_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_double_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_double_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_hermitian_multiply_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh 
+validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_gpu_random_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_gpu_random +else +noinst_PROGRAMS += validate_complex_single_hermitian_multiply_1stage_gpu_random +endif +check_SCRIPTS += validate_complex_single_hermitian_multiply_1stage_gpu_random_default.sh +validate_complex_single_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_complex_single_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_complex_single_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_hermitian_multiply_1stage_gpu_random\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts +else +noinst_PROGRAMS += validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts +endif +check_SCRIPTS += validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_extended.sh +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_gpu_random_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_hermitian_multiply_1stage_gpu_random +else +noinst_PROGRAMS += validate_real_single_hermitian_multiply_1stage_gpu_random +endif +check_SCRIPTS += validate_real_single_hermitian_multiply_1stage_gpu_random_default.sh +validate_real_single_hermitian_multiply_1stage_gpu_random_SOURCES = test/Fortran/test.F90 +validate_real_single_hermitian_multiply_1stage_gpu_random_LDADD = $(test_program_ldadd) +validate_real_single_hermitian_multiply_1stage_gpu_random_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_hermitian_multiply_1stage_gpu_random\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_HERMITIAN_MULTIPLY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_RANDOM +endif +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_toeplitz_all_layouts_extended.sh +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_complex_double_cholesky_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_toeplitz +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_toeplitz_default.sh +validate_complex_double_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_toeplitz_all_layouts_extended.sh +validate_real_double_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_toeplitz +endif +check_SCRIPTS += 
validate_real_double_cholesky_1stage_toeplitz_default.sh +validate_real_double_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_toeplitz_all_layouts_extended.sh +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_toeplitz +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_toeplitz_default.sh +validate_complex_single_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_toeplitz\" \ 
+ -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_toeplitz_all_layouts_extended.sh +validate_real_single_cholesky_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_toeplitz +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_toeplitz_default.sh +validate_real_single_cholesky_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += 
validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_1stage_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvalues_1stage_toeplitz_default.sh +validate_complex_double_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_double_eigenvalues_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_toeplitz_default.sh +validate_real_double_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER 
+bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_default.sh +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_1stage_toeplitz +endif +check_SCRIPTS += 
validate_complex_single_eigenvalues_1stage_toeplitz_default.sh +validate_complex_single_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh 
+validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvalues_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvalues_1stage_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvalues_1stage_toeplitz_default.sh +validate_real_single_eigenvalues_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_toeplitz_FCFLAGS = 
$(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_default.sh +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_toeplitz\" \ + 
-DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_toeplitz_default.sh +validate_complex_double_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +else +noinst_PROGRAMS += 
validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz +else +noinst_PROGRAMS += 
validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_extended.sh 
+validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_toeplitz_default.sh +validate_real_double_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + 
-DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER 
+bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_toeplitz_default.sh +validate_complex_single_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += 
validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += 
validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_toeplitz_default.sh 
+validate_real_single_eigenvectors_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 
+validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_LDADD = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_default_kernel_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif + +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_toeplitz +endif +check_SCRIPTS += validate_real_double_solve_tridiagonal_1stage_toeplitz_default.sh +validate_real_double_solve_tridiagonal_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_solve_tridiagonal_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ 
+ -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ + +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_extended.sh +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_toeplitz +else +noinst_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_toeplitz +endif +check_SCRIPTS += validate_real_single_solve_tridiagonal_1stage_toeplitz_default.sh +validate_real_single_solve_tridiagonal_1stage_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_solve_tridiagonal_1stage_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=0 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += 
validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_cholesky_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_double_cholesky_1stage_gpu_toeplitz_default.sh +validate_complex_double_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_cholesky_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD 
= $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_cholesky_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_cholesky_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_cholesky_1stage_gpu_toeplitz_default.sh +validate_real_double_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_cholesky_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + 
-DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_cholesky_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_cholesky_1stage_gpu_toeplitz_default.sh +validate_complex_single_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_cholesky_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_cholesky_1stage_gpu_toeplitz +else +noinst_PROGRAMS += 
validate_real_single_cholesky_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_cholesky_1stage_gpu_toeplitz_default.sh +validate_real_single_cholesky_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_cholesky_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_cholesky_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_cholesky_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_CHOLESKY \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvalues_1stage_gpu_toeplitz_default.sh +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_LDADD = 
$(test_program_ldadd) +validate_complex_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) 
+validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvalues_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvalues_1stage_gpu_toeplitz_default.sh +validate_real_double_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + 
-DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 
\ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvalues_1stage_gpu_toeplitz_default.sh +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += 
validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif 
+endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvalues_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvalues_1stage_gpu_toeplitz_default.sh +validate_real_single_eigenvalues_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += 
validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_default.sh +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvalues_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVALUES \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +else 
+noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvectors_1stage_gpu_toeplitz_default.sh +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh 
+validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz +endif +check_SCRIPTS += 
validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_double_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh 
+validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_1stage_gpu_toeplitz_default.sh +validate_real_double_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = 
$(test_program_ldadd) +validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = 
$(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_1stage_gpu_toeplitz_default.sh +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_1stage_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = 
$(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_COMPLEX +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_complex_single_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_COMPLEX \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_COMPLEX_DEFAULT +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) 
+validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_1stage_gpu_toeplitz_default.sh +validate_real_single_eigenvectors_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + 
-DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_extended.sh +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_LDADD = $(test_program_ldadd) 
+validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_all_kernels_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_KERNELS +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_default.sh +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_eigenvectors_2stage_default_kernel_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_EIGENVECTORS \ + -DTEST_SOLVER_2STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_KERNEL=ELPA_2STAGE_REAL_DEFAULT +endif +endif + +if WITH_GPU_VERSION +if WITH_MPI +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS = 
$(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif + +if WITH_GPU_VERSION +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_default.sh +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_double_solve_tridiagonal_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_DOUBLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif + +if WITH_GPU_VERSION +if WITH_MPI +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +else +noinst_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts +endif +check_SCRIPTS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_extended.sh +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_SOURCES = test/Fortran/test.F90 +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_LDADD = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_all_layouts\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + 
-DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ \ + -DTEST_ALL_LAYOUTS +endif +endif +endif + +if WITH_GPU_VERSION +if WANT_SINGLE_PRECISION_REAL +if BUILD_KCOMPUTER +bin_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz +else +noinst_PROGRAMS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz +endif +check_SCRIPTS += validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_default.sh +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_SOURCES = test/Fortran/test.F90 +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_LDADD = $(test_program_ldadd) +validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz_FCFLAGS = $(test_program_fcflags) \ + -DTEST_CASE=\"validate_real_single_solve_tridiagonal_1stage_gpu_toeplitz\" \ + -DTEST_REAL \ + -DTEST_SINGLE \ + -DTEST_SOLVE_TRIDIAGONAL \ + -DTEST_SOLVER_1STAGE \ + -DTEST_GPU=1 \ + -DTEST_QR_DECOMPOSITION=0 \ + -DTEST_MATRIX_TOEPLITZ +endif +endif + +if ENABLE_AUTOTUNING +if ENABLE_C_TESTS +check_SCRIPTS += validate_autotune_c_version_complex_double_extended.sh +noinst_PROGRAMS += validate_autotune_c_version_complex_double +validate_autotune_c_version_complex_double_SOURCES = test/C/test_autotune.c +validate_autotune_c_version_complex_double_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_autotune_c_version_complex_double_CFLAGS = $(test_program_cflags) \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE + +endif +endif +if ENABLE_AUTOTUNING +if ENABLE_C_TESTS +check_SCRIPTS += validate_autotune_c_version_real_double_extended.sh +noinst_PROGRAMS += validate_autotune_c_version_real_double +validate_autotune_c_version_real_double_SOURCES = test/C/test_autotune.c +validate_autotune_c_version_real_double_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_autotune_c_version_real_double_CFLAGS = $(test_program_cflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE + +endif +endif +if WANT_SINGLE_PRECISION_COMPLEX +if ENABLE_AUTOTUNING +if ENABLE_C_TESTS 
+check_SCRIPTS += validate_autotune_c_version_complex_single_extended.sh +noinst_PROGRAMS += validate_autotune_c_version_complex_single +validate_autotune_c_version_complex_single_SOURCES = test/C/test_autotune.c +validate_autotune_c_version_complex_single_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_autotune_c_version_complex_single_CFLAGS = $(test_program_cflags) \ + -DTEST_COMPLEX \ + -DTEST_SINGLE +endif + +endif +endif +if WANT_SINGLE_PRECISION_REAL +if ENABLE_AUTOTUNING +if ENABLE_C_TESTS +check_SCRIPTS += validate_autotune_c_version_real_single_extended.sh +noinst_PROGRAMS += validate_autotune_c_version_real_single +validate_autotune_c_version_real_single_SOURCES = test/C/test_autotune.c +validate_autotune_c_version_real_single_LDADD = $(test_program_ldadd) $(FCLIBS) +validate_autotune_c_version_real_single_CFLAGS = $(test_program_cflags) \ + -DTEST_REAL \ + -DTEST_SINGLE +endif + +endif +endif +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_autotune_complex_double_extended.sh +noinst_PROGRAMS += validate_autotune_complex_double +validate_autotune_complex_double_SOURCES = test/Fortran/test_autotune.F90 +validate_autotune_complex_double_LDADD = $(test_program_ldadd) +validate_autotune_complex_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_COMPLEX \ + -DTEST_DOUBLE + +endif +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_autotune_real_double_extended.sh +noinst_PROGRAMS += validate_autotune_real_double +validate_autotune_real_double_SOURCES = test/Fortran/test_autotune.F90 +validate_autotune_real_double_LDADD = $(test_program_ldadd) +validate_autotune_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE + +endif +if WANT_SINGLE_PRECISION_COMPLEX +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_autotune_complex_single_extended.sh +noinst_PROGRAMS += validate_autotune_complex_single +validate_autotune_complex_single_SOURCES = test/Fortran/test_autotune.F90 +validate_autotune_complex_single_LDADD = $(test_program_ldadd) 
+validate_autotune_complex_single_FCFLAGS = $(test_program_fcflags) \ + -DTEST_COMPLEX \ + -DTEST_SINGLE +endif + +endif +if WANT_SINGLE_PRECISION_REAL +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_autotune_real_single_extended.sh +noinst_PROGRAMS += validate_autotune_real_single +validate_autotune_real_single_SOURCES = test/Fortran/test_autotune.F90 +validate_autotune_real_single_LDADD = $(test_program_ldadd) +validate_autotune_real_single_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_SINGLE +endif + +endif +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_multiple_objs_real_double_extended.sh +noinst_PROGRAMS += validate_multiple_objs_real_double +validate_multiple_objs_real_double_SOURCES = test/Fortran/test_multiple_objs.F90 +validate_multiple_objs_real_double_LDADD = $(test_program_ldadd) +validate_multiple_objs_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE +endif +check_SCRIPTS += test_skewsymmetric_real_double_extended.sh +noinst_PROGRAMS += test_skewsymmetric_real_double +test_skewsymmetric_real_double_SOURCES = test/Fortran/test_skewsymmetric.F90 +test_skewsymmetric_real_double_LDADD = $(test_program_ldadd) +test_skewsymmetric_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE +if WANT_SINGLE_PRECISION_REAL +check_SCRIPTS += test_skewsymmetric_real_single_extended.sh +noinst_PROGRAMS += test_skewsymmetric_real_single +test_skewsymmetric_real_single_SOURCES = test/Fortran/test_skewsymmetric.F90 +test_skewsymmetric_real_single_LDADD = $(test_program_ldadd) +test_skewsymmetric_real_single_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_SINGLE +endif +if ENABLE_C_TESTS +if ENABLE_AUTOTUNING +check_SCRIPTS += validate_multiple_objs_real_double_c_version_extended.sh +noinst_PROGRAMS += validate_multiple_objs_real_double_c_version +validate_multiple_objs_real_double_c_version_SOURCES = test/C/test_multiple_objs.c +validate_multiple_objs_real_double_c_version_LDADD = 
$(test_program_ldadd) $(FCLIBS) +validate_multiple_objs_real_double_c_version_CFLAGS = $(test_program_cflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE +endif +endif +check_SCRIPTS += validate_split_comm_real_double_extended.sh +noinst_PROGRAMS += validate_split_comm_real_double +validate_split_comm_real_double_SOURCES = test/Fortran/test_split_comm.F90 +validate_split_comm_real_double_LDADD = $(test_program_ldadd) +validate_split_comm_real_double_FCFLAGS = $(test_program_fcflags) \ + -DTEST_REAL \ + -DTEST_DOUBLE diff -Nru elpa-2016.05.001/test_project_1stage/autogen.sh elpa-2019.11.001/test_project_1stage/autogen.sh --- elpa-2016.05.001/test_project_1stage/autogen.sh 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/autogen.sh 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/sh -e + +mkdir -p m4/ + +test -n "$srcdir" || srcdir=`dirname "$0"` +test -n "$srcdir" || srcdir=. + +autoreconf --force --install --verbose "$srcdir" diff -Nru elpa-2016.05.001/test_project_1stage/configure.ac elpa-2019.11.001/test_project_1stage/configure.ac --- elpa-2016.05.001/test_project_1stage/configure.ac 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/configure.ac 2019-12-20 05:57:47.000000000 +0000 @@ -0,0 +1,94 @@ +AC_PREREQ([2.69]) +AC_INIT([elpa_test_project],[2019.11.001], elpa-library@rzg.mpg.de) +elpaversion="2019.11.001" +AC_CONFIG_SRCDIR([src/test_real.F90]) + +AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) + +# Without this, automake tries to be smart and rebuilt +# the autoconf generated files such as configure, aclocal.m4, etc., +# in case the timestamps of files such as configure.ac are newer +# +# This only makes trouble for end users with out-of-date autoconf versions +# that cannot produce these files +AM_MAINTAINER_MODE([disable]) + +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_HEADERS([config.h]) +AM_SILENT_RULES([yes]) + +rm -rf config.h config-f90.h + +AX_CHECK_GNU_MAKE() +if test x$_cv_gnu_make_command = x ; 
then + AC_MSG_ERROR([Need GNU Make]) +fi + +AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) +if test x"${CPP_FOUND}" = xno; then + AC_MSG_ERROR([no cpp found]) +fi + +# gnu-make fortran module dependencies +m4_include([fdep/fortran_dependencies.m4]) +FDEP_F90_GNU_MAKE_DEPS + +AC_PROG_INSTALL +AM_PROG_CC_C_O +AM_PROG_AR +AM_PROG_AS + +AC_LANG([Fortran]) +m4_include([m4/ax_prog_fc_mpi.m4]) + +dnl check whether an mpi compiler is available; +dnl if not abort since it is mandatory +AX_PROG_FC_MPI([],[have_mpi=yes],[have_mpi=no + if test x"${have_mpi}" = xno; then + AC_MSG_ERROR([no mpi found]) + fi]) + +AC_FC_FREEFORM +AC_FC_MODULE_FLAG +AC_FC_MODULE_OUTPUT_FLAG + +AC_MSG_CHECKING(whether OpenMP usage is specified) +AC_ARG_WITH([openmp], + AS_HELP_STRING([--with-openmp], + [use OpenMP threading, default no.]), + [with_openmp=yes], + [with_openmp=no]) + AC_MSG_RESULT([${with_openmp}]) + if test x"${enable_openmp}" = x"yes"; then + with_openmp=yes + AC_MSG_CHECKING(whether --enable-openmp is specified) + AC_MSG_RESULT([${enable_openmp}]) + fi + AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"]) + if test x"${with_openmp}" = x"yes"; then + AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) + AX_ELPA_OPENMP + elpa="elpa_openmp-$elpaversion" + else + elpa="elpa-$elpaversion" + fi + +# Here comes the ELPA specific part +PKG_PROG_PKG_CONFIG +PKG_CHECK_MODULES([ELPA],[${elpa}],[],[AC_MSG_ERROR(["Need ${elpa}"])]) +PKG_CHECK_VAR([ELPA_FCFLAGS],[${elpa}],[fcflags]) + +LT_INIT + +AC_SUBST([FC_MODINC]) +AC_SUBST([FC_MODOUT]) + +rm -rf modules/ .fortran_dependencies/ +mkdir modules + +AC_CONFIG_FILES([ + Makefile +]) +AC_OUTPUT + +grep "^#define" config.h > config-f90.h diff -Nru elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.m4 elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.m4 --- elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.m4 1970-01-01 00:00:00.000000000 +0000 +++ 
elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,19 @@ +AC_DEFUN([FDEP_F90_GNU_MAKE_DEPS],[ +AC_MSG_CHECKING([for GNU make]) +for a in "$MAKE" make gmake gnumake ; do + if test -z "$a" ; then continue ; fi ; + if ( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then + _fdep_gnu_make_command=$a ; + break; + fi +done ; +AC_MSG_RESULT([$_fdep_gnu_make_command]) +if test x$_fdep_gnu_make_command = x ; then + AC_MSG_ERROR([Need GNU Make]) +fi +AC_SUBST([FORTRAN_MODULE_DEPS], [" +CLEANFILES += +include ${srcdir}/fdep/fortran_dependencies.mk +"]) +AM_SUBST_NOTMAKE([FORTRAN_MODULE_DEPS]) +]) diff -Nru elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.mk elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.mk --- elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.mk 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.mk 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,80 @@ +_f90_verbose = $(_f90_verbose_$(V)) +_f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY)) +_f90_verbose_0 = @echo " $1"; +_f90_targets = $(subst -,_,$(patsubst %.la,%_la,$(patsubst %.a,%_a,$(patsubst %.so,%_so,$(PROGRAMS) $(LTLIBRARIES))))) +FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror + +# $1 source files +# +# returns: file without any .F90 .f90 .F .f extension +define strip_fortran_ext +$(patsubst %.F90,%,$(patsubst %.f90,%,$(patsubst %.F,%,$(patsubst %.f,%,$1)))) +endef + +# $1 program +# +# returns: +# '1' if object files for target $1 are prefixed due to 'per-target' flags, +# '' (the empty string) otherwise. See the automake manual for 'per-target' +# compilation +# +define is_per_target +$(if $(filter $(call strip_fortran_ext,$(firstword $(call fortran_sources,$1))),$(patsubst %.o,%,$(patsubst %.lo,%,$($1_OBJECTS)))),,1) +endef + +# $1 top-level target name (i.e. 
an entry of _f90_targets) +# +# returns: all target source files matching *.F90 *.f90 *.F *.f +define fortran_sources +$(filter %.F90 %.f90 %.F %.f,$($1_SOURCES)) +endef + +# $1 top-level target name +# +# returns: the appropriate extension (i.e. 'o' for normal programs, '.lo' for libraries) +define object_extension +$(if $(filter $1,$(PROGRAMS)),o,lo) +endef + +# $1 source_file +# $2 stem +# $3 program +define module_targets +$(eval _$3_use_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 USE [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *use [^ ,!:]*' | sort -u > $$@ + +$(eval _$3_def_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 MOD [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *module [^!]*' | grep -v "\" > $$@ || true + +endef +$(foreach p,$(_f90_targets),$(if $(call is_per_target,$p),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,$p-,$p))),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,,$p))))) + +_f90_depdir=$(abs_builddir)/.fortran_dependencies +_f90_depfile = $(_f90_depdir)/dependencies.mk + +define is_clean +$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1) +endef + +define _fdep_newline + + +endef + +ifneq ($(call is_clean),1) +include $(_f90_depfile) +endif +$(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p) + 
$(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) >> $@ || { rm $@; exit 1; } ;$(_fdep_newline)) + +$(_f90_depdir): + @mkdir $@ + +$(foreach p,$(_f90_targets),$(_f90_depdir)/$p): | $(_f90_depdir) + @mkdir $@ + +CLEANFILES += $(foreach p,$(_f90_targets),$(_$p_def_mods) $(_$p_use_mods)) +CLEANFILES += $(foreach p,$(_f90_targets),$(_f90_depdir)/$p/*) +CLEANFILES += $(_f90_depfile) diff -Nru elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.pl elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.pl --- elpa-2016.05.001/test_project_1stage/fdep/fortran_dependencies.pl 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/fdep/fortran_dependencies.pl 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,77 @@ +#!/usr/bin/perl -w + +use strict; + +my %defs = (); +my %uses = (); + +my $use_re = qr/^\s*use\s+(\S+)\s*$/; +my $def_re = qr/^\s*module\s+(\S+)\s*$/; + +sub add_use { + my ($file, $module) = @_; + if (defined($defs{$module}) && $defs{$module} eq $file) { + # do not add self-dependencies + return; + } + if (!defined($uses{$file})) { + $uses{$file} = { $module => 1 }; + } else { + $uses{$file}{$module} = 1; + } +} + +sub add_def { + my ($file, $module) = @_; + if (!defined($defs{$module})) { + $defs{$module} = $file; + if (defined($uses{$file}) && defined($uses{$file}{$module})) { + delete $uses{$file}{$module}; + } + } else { + die "Module $module both defined in $file, $defs{$module}"; + } +} + +my $p = shift; + +foreach my $file (@ARGV) { + my $re; + my $add; + my $object; + if (defined($ENV{V}) && $ENV{V} ge "2") { + print STDERR "fdep: Considering file $file\n"; + } + if ($file =~ /^(.*)\.def_mods.$p(\..*)$/) { + $re = $def_re; + $add = \&add_def; + $object = $1 . $2; + } elsif ($file =~ /^(.*)\.use_mods.$p(\..*)$/) { + $re = $use_re; + $add = \&add_use; + $object = $1 . 
$2; + } else { + die "Unrecognized file extension for '$file'\nExpected (.*)\.def_mods.$p(\..*) or (.*)\.use_mods.$p(\..*)"; + } + open(FILE,"<",$file) || die "\nCan't open $file: $!\n\n"; + while() { + chomp; + $_ = lc($_); + if ($_ =~ $re) { + &$add($object, $1); + } else { + die "Cannot parse module statement '$_', was expecting $re"; + } + } + close(FILE) +} + +foreach my $object (sort keys %uses) { + for my $m (keys %{$uses{$object}}) { + if (defined $defs{$m}) { + print "$object: ", $defs{$m}, "\n"; + } elsif (defined($ENV{V}) && $ENV{V} ge "1") { + print STDERR "fdep: Warning: Cannot find definition of module $m in files for program $p, might be external\n"; + } + } +} diff -Nru elpa-2016.05.001/test_project_1stage/fdep/LICENSE elpa-2019.11.001/test_project_1stage/fdep/LICENSE --- elpa-2016.05.001/test_project_1stage/fdep/LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/fdep/LICENSE 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,19 @@ +Copyright (c) 2013 Lorenz Hüdepohl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff -Nru elpa-2016.05.001/test_project_1stage/fdep/README elpa-2019.11.001/test_project_1stage/fdep/README --- elpa-2016.05.001/test_project_1stage/fdep/README 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/fdep/README 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,99 @@ +fdep +---- + +fdep is a small set of scripts to teach autoconf/automake (using GNU make) +about the additional dependencies in Fortran 90 files due to modules. + +With this, Fortran files can be listed in any order in Makefile.am and parallel +builds work. + + +Usage +----- + + Put this project as a directory "fdep" in your source code, place the two + lines + + m4_include([fdep/fortran_dependencies.m4]) + FDEP_F90_GNU_MAKE_DEPS + + in your configure.ac, and add a single line + + @FORTRAN_MODULE_DEPS@ + + in your Makefile.am. All .F90 files of all programs in bin_PROGRAMS and all + libraries in lib_LTLIBRARIES will now be scanned for modules and the + resulting dependencies will be honoured. + + +What is the problem with Fortran 90 modules and make dependencies? +------------------------------------------------------------------ + + In Fortran 90 source files one can define any number of "modules", containing + variable and function definitions. The names of the modules defined in a file + can be arbitrary. + + In another source file these modules can be used, informing the Fortran + compiler about the definitions in these modules (e.g. to do type-checking). + This creates a problem, as the compiler has to know somehow where the module + is defined. 
+ + The usual solution employed by almost every Fortran compiler is to create + special "module" files for each module contained in a source file during + compilation. Their file name is derived by a compiler-specific recipe of the + modules identifier (usually the lower-cased module's identifier plus ".mod", + so "foo_module.mod" and "some_other_module.mod"). When the compiler + encounters a "use" statement during the compilation of another file, it + confers to this file to import the definitions of the module. + + That means, you cannot compile files using modules defined in yet un-compiled + files, one has to tell make about this dependency. + + (A primitive solution to this problem is listing the file in a pre-sorted + order, so that files defining modules are compiled first. + + However, that way the dependency-graph make knows about is incomplete and + parallel builds will fail with a high probability) + + +How does fdep solve this problem technically? +--------------------------------------------- + + As the name of the module files can be an arbitrary (and some compilers might + even save the module definitions in some completely different way), fdep + tells make about the module dependencies as a relation directly between + object files, e.g. when a file 'b.f90' is using any module of file 'a.f90', + fdep adds a dependency of + + b.o: a.o + + + More specifically, the perl-script fortran_dependencies.pl is run by make to + create a file .fortran_dependencies/dependencies.mk, which is then included. + To do this, first every source file (for every defined program and library) + is scanned for lines with "module" or "use" statements. These are saved in + two additional files (.use_mods and .def_mods) per source file and contain + lists of defined and required modules. The perl script then reads these in + and produces the appropriate rules. + + +Drawbacks +--------- + + GNU make is required. 
The detailed dependency graph due to "module" and "use" + statements is only available after pre-processing, when autoconf and even + configure is long over. To still get proper dependencies, fdep uses GNU + make's feature to include generated sub-Makefiles during a running make + invocation. + + +License +------- + + fdep is released under the MIT License. See the LICENSE file for details. + + +Contributing +------------ + + Send your patches or pull-request to dev@stellardeath.org diff -Nru elpa-2016.05.001/test_project_1stage/m4/ax_prog_fc_mpi.m4 elpa-2019.11.001/test_project_1stage/m4/ax_prog_fc_mpi.m4 --- elpa-2016.05.001/test_project_1stage/m4/ax_prog_fc_mpi.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/m4/ax_prog_fc_mpi.m4 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,162 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) +# +# DESCRIPTION +# +# This macro tries to find out how to compile Fortran77 programs that use +# MPI (Message Passing Interface), a standard API for parallel process +# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to +# be used instead of the standard macro AC_PROG_FC and will replace the +# standard variable FC with the found compiler. +# +# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the +# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will +# try to find out how to use MPI, if it fails, the macro will call +# AC_PROG_CC to find a standard C compiler instead. +# +# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found +# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If +# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. 
+# +# The following example demonstrates usage of the macro: +# +# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. +# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. +# # If --with-mpi=no is used, use a standard FC compiler instead. +# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], +# [compile with MPI (parallelization) support. If none is found, +# MPI is not used. Default: auto]) +# ],,[with_mpi=auto]) +# +# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ +# use_mpi=no +# if test x"$with_mpi" = xyes; then +# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) +# else +# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) +# fi +# ]) +# +# LICENSE +# +# Copyright (c) 2010,2011 Olaf Lenz +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. 
+# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +AC_DEFUN([AX_PROG_FC_MPI], [ +AC_PREREQ(2.50) + +# Check for compiler +# Needs to be split off into an extra macro to ensure right expansion +# order. +AC_REQUIRE([_AX_PROG_FC_MPI],[_AX_PROG_FC_MPI([$1])]) + +AS_IF([test x"$_ax_prog_fc_mpi_mpi_wanted" = xno], + [ _ax_prog_fc_mpi_mpi_found=no ], + [ + AC_LANG_PUSH([Fortran]) + + # test whether MPI_INIT is available + # We do not use AC_SEARCH_LIBS here, as it caches its outcome and + # thus disallows corresponding calls in the other AX_PROG_*_MPI + # macros. + for lib in NONE mpichf90 fmpi fmpich; do + save_LIBS=$LIBS + if test x"$lib" = xNONE; then + AC_MSG_CHECKING([for function MPI_INIT]) + else + AC_MSG_CHECKING([for function MPI_INIT in -l$lib]) + LIBS="-l$lib $LIBS" + fi + AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_INIT])], + [ _ax_prog_fc_mpi_mpi_found=yes ], + [ _ax_prog_fc_mpi_mpi_found=no ]) + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_found) + if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then + break; + fi + LIBS=$save_LIBS + done + + # Check for header + AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + AC_MSG_CHECKING([for mpif.h]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[[ + include 'mpif.h' +]])], + [ AC_MSG_RESULT(yes)], + [ AC_MSG_RESULT(no) + _ax_prog_fc_mpi_mpi_found=no + ]) + ]) + AC_LANG_POP([Fortran]) +]) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) + : +],[ + $3 + : +]) + +])dnl AX_PROG_FC_MPI + +dnl _AX_PROG_FC_MPI is an internal macro required by AX_PROG_FC_MPI. 
+dnl To ensure the right expansion order, the main function AX_PROG_FC_MPI +dnl has to be split into two parts. This part looks for the MPI +dnl compiler, while the other one tests whether an MPI program can be +dnl compiled. +dnl +AC_DEFUN([_AX_PROG_FC_MPI], [ + AC_ARG_VAR(MPIFC,[MPI Fortran compiler command]) + ifelse([$1],,[_ax_prog_fc_mpi_mpi_wanted=yes],[ + AC_MSG_CHECKING([whether to compile using MPI]) + if $1; then + _ax_prog_fc_mpi_mpi_wanted=yes + else + _ax_prog_fc_mpi_mpi_wanted=no + fi + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_wanted) + ]) + if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then + if test -z "$FC" && test -n "$MPIFC"; then + FC="$MPIFC" + else + AC_CHECK_TOOLS([FC], [mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77]) + fi + fi + AC_PROG_FC +])dnl _AX_PROG_FC_MPI diff -Nru elpa-2016.05.001/test_project_1stage/Makefile.am elpa-2019.11.001/test_project_1stage/Makefile.am --- elpa-2016.05.001/test_project_1stage/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/Makefile.am 2019-12-19 09:47:43.000000000 +0000 @@ -0,0 +1,13 @@ +## Process this file with automake to produce Makefile.in + +ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 + +AM_FCFLAGS = @FC_MODINC@modules $(ELPA_FCFLAGS) +AM_LDFLAGS = $(ELPA_LIBS) + +#bindir = $(abs_top_builddir) +bin_PROGRAMS = test_real +test_real_SOURCES = src/test_real.F90 + +distclean-local: + -rm config-f90.h diff -Nru elpa-2016.05.001/test_project_1stage/src/test_real.F90 elpa-2019.11.001/test_project_1stage/src/test_real.F90 --- elpa-2016.05.001/test_project_1stage/src/test_real.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_1stage/src/test_real.F90 2019-12-19 
09:47:43.000000000 +0000 @@ -0,0 +1,237 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 1 real case library. 
+!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). +!> If these values are not set default values (4000, 1500, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +program test_real_example + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - REAL version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +!------------------------------------------------------------------------------- + + use iso_c_binding + + use elpa + +#ifdef HAVE_MPI_MODULE + use mpi + implicit none +#else + implicit none + include 'mpif.h' +#endif + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! 
nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + integer :: nblk + integer :: na, nev + + integer :: np_rows, np_cols, na_rows, na_cols + + integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + integer, external :: numroc + + real(kind=c_double), allocatable :: a(:,:), z(:,:), ev(:) + + integer :: iseed(4096) ! Random seed, size should be sufficient for every generator + + integer :: STATUS + integer :: success + character(len=8) :: task_suffix + integer :: j + + integer, parameter :: error_units = 0 + + class(elpa_t), pointer :: e + !------------------------------------------------------------------------------- + + + ! default parameters + na = 1000 + nev = 500 + nblk = 16 + + call mpi_init(mpierr) + call mpi_comm_rank(mpi_comm_world,myid,mpierr) + call mpi_comm_size(mpi_comm_world,nprocs,mpierr) + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + ! initialise BLACS + my_blacs_ctxt = mpi_comm_world + call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols) + call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol) + + if (myid==0) then + print '(a)','| Past BLACS_Gridinfo.' + end if + ! determine the neccessary size of the distributed matrices, + ! we use the scalapack tools routine NUMROC + + na_rows = numroc(na, nblk, my_prow, 0, np_rows) + na_cols = numroc(na, nblk, my_pcol, 0, np_cols) + + + ! set up the scalapack descriptor for the checks below + ! For ELPA the following restrictions hold: + ! - block sizes in both directions must be identical (args 4 a. 5) + ! - first row and column of the distributed matrix must be on + ! 
row/col 0/0 (arg 6 and 7) + + call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info) + + if (info .ne. 0) then + write(error_units,*) 'Error in BLACS descinit! info=',info + write(error_units,*) 'Most likely this happend since you want to use' + write(error_units,*) 'more MPI tasks than are possible for your' + write(error_units,*) 'problem size (matrix size and blocksize)!' + write(error_units,*) 'The blacsgrid can not be set up properly' + write(error_units,*) 'Try reducing the number of MPI tasks...' + call MPI_ABORT(mpi_comm_world, 1, mpierr) + endif + + if (myid==0) then + print '(a)','| Past scalapack descriptor setup.' + end if + + allocate(a (na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + + allocate(ev(na)) + + ! we want different random numbers on every process + ! (otherwise A might get rank deficient): + + iseed(:) = myid + call RANDOM_SEED(put=iseed) + call RANDOM_NUMBER(z) + + a(:,:) = z(:,:) + + if (myid == 0) then + print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' + endif + call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T + + !------------------------------------------------------------------------------- + + if (elpa_init(20171201) /= elpa_ok) then + print *, "ELPA API version not supported" + stop + endif + e => elpa_allocate() + + ! set parameters decribing the matrix and it's MPI distribution + call e%set("na", na, success) + call e%set("nev", nev, success) + call e%set("local_nrows", na_rows, success) + call e%set("local_ncols", na_cols, success) + call e%set("nblk", nblk, success) + call e%set("mpi_comm_parent", mpi_comm_world, success) + call e%set("process_row", my_prow, success) + call e%set("process_col", my_pcol, success) + + success = e%setup() + + call e%set("solver", elpa_solver_1stage, success) + + + ! Calculate eigenvalues/eigenvectors + + if (myid==0) then + print '(a)','| Entering one-step ELPA solver ... 
' + print * + end if + + call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only + call e%eigenvectors(a, ev, z, success) + + if (myid==0) then + print '(a)','| One-step ELPA solver complete.' + print * + end if + + call elpa_deallocate(e) + call elpa_uninit() + + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) + +end + diff -Nru elpa-2016.05.001/test_project_2stage/autogen.sh elpa-2019.11.001/test_project_2stage/autogen.sh --- elpa-2016.05.001/test_project_2stage/autogen.sh 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/autogen.sh 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/sh -e + +mkdir -p m4/ + +test -n "$srcdir" || srcdir=`dirname "$0"` +test -n "$srcdir" || srcdir=. + +autoreconf --force --install --verbose "$srcdir" diff -Nru elpa-2016.05.001/test_project_2stage/configure.ac elpa-2019.11.001/test_project_2stage/configure.ac --- elpa-2016.05.001/test_project_2stage/configure.ac 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/configure.ac 2019-12-20 05:57:47.000000000 +0000 @@ -0,0 +1,94 @@ +AC_PREREQ([2.69]) +AC_INIT([elpa_test_project],[2019.11.001], elpa-library@rzg.mpg.de) +elpaversion="2019.11.001" +AC_CONFIG_SRCDIR([src/test_real2.F90]) + +AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) + +# Without this, automake tries to be smart and rebuilt +# the autoconf generated files such as configure, aclocal.m4, etc., +# in case the timestamps of files such as configure.ac are newer +# +# This only makes trouble for end users with out-of-date autoconf versions +# that cannot produce these files +AM_MAINTAINER_MODE([disable]) + +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_HEADERS([config.h]) +AM_SILENT_RULES([yes]) + +rm -rf config.h config-f90.h + +AX_CHECK_GNU_MAKE() +if test x$_cv_gnu_make_command = x ; then + AC_MSG_ERROR([Need GNU Make]) +fi + +AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) +if test x"${CPP_FOUND}" = xno; then + AC_MSG_ERROR([no cpp found]) +fi + +# 
gnu-make fortran module dependencies +m4_include([fdep/fortran_dependencies.m4]) +FDEP_F90_GNU_MAKE_DEPS + +AC_PROG_INSTALL +AM_PROG_CC_C_O +AM_PROG_AR +AM_PROG_AS + +AC_LANG([Fortran]) +m4_include([m4/ax_prog_fc_mpi.m4]) + +dnl check whether an mpi compiler is available; +dnl if not abort since it is mandatory +AX_PROG_FC_MPI([],[have_mpi=yes],[have_mpi=no + if test x"${have_mpi}" = xno; then + AC_MSG_ERROR([no mpi found]) + fi]) + +AC_FC_FREEFORM +AC_FC_MODULE_FLAG +AC_FC_MODULE_OUTPUT_FLAG + +AC_MSG_CHECKING(whether OpenMP usage is specified) +AC_ARG_WITH([openmp], + AS_HELP_STRING([--with-openmp], + [use OpenMP threading, default no.]), + [with_openmp=yes], + [with_openmp=no]) + AC_MSG_RESULT([${with_openmp}]) + if test x"${enable_openmp}" = x"yes"; then + with_openmp=yes + AC_MSG_CHECKING(whether --enable-openmp is specified) + AC_MSG_RESULT([${enable_openmp}]) + fi + AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"]) + if test x"${with_openmp}" = x"yes"; then + AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) + AX_ELPA_OPENMP + elpa="elpa_openmp-$elpaversion" + else + elpa="elpa-$elpaversion" + fi + +# Here comes the ELPA specific part +PKG_PROG_PKG_CONFIG +PKG_CHECK_MODULES([ELPA],[${elpa}],[],[AC_MSG_ERROR(["Need ${elpa}"])]) +PKG_CHECK_VAR([ELPA_FCFLAGS],[${elpa}],[fcflags]) + +LT_INIT + +AC_SUBST([FC_MODINC]) +AC_SUBST([FC_MODOUT]) + +rm -rf modules/ .fortran_dependencies/ +mkdir modules + +AC_CONFIG_FILES([ + Makefile +]) +AC_OUTPUT + +grep "^#define" config.h > config-f90.h diff -Nru elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.m4 elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.m4 --- elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.m4 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,19 @@ +AC_DEFUN([FDEP_F90_GNU_MAKE_DEPS],[ +AC_MSG_CHECKING([for GNU make]) +for a in "$MAKE" make gmake 
gnumake ; do + if test -z "$a" ; then continue ; fi ; + if ( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then + _fdep_gnu_make_command=$a ; + break; + fi +done ; +AC_MSG_RESULT([$_fdep_gnu_make_command]) +if test x$_fdep_gnu_make_command = x ; then + AC_MSG_ERROR([Need GNU Make]) +fi +AC_SUBST([FORTRAN_MODULE_DEPS], [" +CLEANFILES += +include ${srcdir}/fdep/fortran_dependencies.mk +"]) +AM_SUBST_NOTMAKE([FORTRAN_MODULE_DEPS]) +]) diff -Nru elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.mk elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.mk --- elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.mk 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.mk 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,80 @@ +_f90_verbose = $(_f90_verbose_$(V)) +_f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY)) +_f90_verbose_0 = @echo " $1"; +_f90_targets = $(subst -,_,$(patsubst %.la,%_la,$(patsubst %.a,%_a,$(patsubst %.so,%_so,$(PROGRAMS) $(LTLIBRARIES))))) +FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror + +# $1 source files +# +# returns: file without any .F90 .f90 .F .f extension +define strip_fortran_ext +$(patsubst %.F90,%,$(patsubst %.f90,%,$(patsubst %.F,%,$(patsubst %.f,%,$1)))) +endef + +# $1 program +# +# returns: +# '1' if object files for target $1 are prefixed due to 'per-target' flags, +# '' (the empty string) otherwise. See the automake manual for 'per-target' +# compilation +# +define is_per_target +$(if $(filter $(call strip_fortran_ext,$(firstword $(call fortran_sources,$1))),$(patsubst %.o,%,$(patsubst %.lo,%,$($1_OBJECTS)))),,1) +endef + +# $1 top-level target name (i.e. an entry of _f90_targets) +# +# returns: all target source files matching *.F90 *.f90 *.F *.f +define fortran_sources +$(filter %.F90 %.f90 %.F %.f,$($1_SOURCES)) +endef + +# $1 top-level target name +# +# returns: the appropriate extension (i.e. 
'o' for normal programs, '.lo' for libraries) +define object_extension +$(if $(filter $1,$(PROGRAMS)),o,lo) +endef + +# $1 source_file +# $2 stem +# $3 program +define module_targets +$(eval _$3_use_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 USE [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *use [^ ,!:]*' | sort -u > $$@ + +$(eval _$3_def_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 MOD [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *module [^!]*' | grep -v "\" > $$@ || true + +endef +$(foreach p,$(_f90_targets),$(if $(call is_per_target,$p),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,$p-,$p))),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,,$p))))) + +_f90_depdir=$(abs_builddir)/.fortran_dependencies +_f90_depfile = $(_f90_depdir)/dependencies.mk + +define is_clean +$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1) +endef + +define _fdep_newline + + +endef + +ifneq ($(call is_clean),1) +include $(_f90_depfile) +endif +$(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p) + $(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) >> $@ || { rm $@; exit 1; } ;$(_fdep_newline)) + +$(_f90_depdir): + @mkdir $@ + +$(foreach 
p,$(_f90_targets),$(_f90_depdir)/$p): | $(_f90_depdir) + @mkdir $@ + +CLEANFILES += $(foreach p,$(_f90_targets),$(_$p_def_mods) $(_$p_use_mods)) +CLEANFILES += $(foreach p,$(_f90_targets),$(_f90_depdir)/$p/*) +CLEANFILES += $(_f90_depfile) diff -Nru elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.pl elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.pl --- elpa-2016.05.001/test_project_2stage/fdep/fortran_dependencies.pl 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/fdep/fortran_dependencies.pl 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,77 @@ +#!/usr/bin/perl -w + +use strict; + +my %defs = (); +my %uses = (); + +my $use_re = qr/^\s*use\s+(\S+)\s*$/; +my $def_re = qr/^\s*module\s+(\S+)\s*$/; + +sub add_use { + my ($file, $module) = @_; + if (defined($defs{$module}) && $defs{$module} eq $file) { + # do not add self-dependencies + return; + } + if (!defined($uses{$file})) { + $uses{$file} = { $module => 1 }; + } else { + $uses{$file}{$module} = 1; + } +} + +sub add_def { + my ($file, $module) = @_; + if (!defined($defs{$module})) { + $defs{$module} = $file; + if (defined($uses{$file}) && defined($uses{$file}{$module})) { + delete $uses{$file}{$module}; + } + } else { + die "Module $module both defined in $file, $defs{$module}"; + } +} + +my $p = shift; + +foreach my $file (@ARGV) { + my $re; + my $add; + my $object; + if (defined($ENV{V}) && $ENV{V} ge "2") { + print STDERR "fdep: Considering file $file\n"; + } + if ($file =~ /^(.*)\.def_mods.$p(\..*)$/) { + $re = $def_re; + $add = \&add_def; + $object = $1 . $2; + } elsif ($file =~ /^(.*)\.use_mods.$p(\..*)$/) { + $re = $use_re; + $add = \&add_use; + $object = $1 . 
$2; + } else { + die "Unrecognized file extension for '$file'\nExpected (.*)\.def_mods.$p(\..*) or (.*)\.use_mods.$p(\..*)"; + } + open(FILE,"<",$file) || die "\nCan't open $file: $!\n\n"; + while() { + chomp; + $_ = lc($_); + if ($_ =~ $re) { + &$add($object, $1); + } else { + die "Cannot parse module statement '$_', was expecting $re"; + } + } + close(FILE) +} + +foreach my $object (sort keys %uses) { + for my $m (keys %{$uses{$object}}) { + if (defined $defs{$m}) { + print "$object: ", $defs{$m}, "\n"; + } elsif (defined($ENV{V}) && $ENV{V} ge "1") { + print STDERR "fdep: Warning: Cannot find definition of module $m in files for program $p, might be external\n"; + } + } +} diff -Nru elpa-2016.05.001/test_project_2stage/fdep/LICENSE elpa-2019.11.001/test_project_2stage/fdep/LICENSE --- elpa-2016.05.001/test_project_2stage/fdep/LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/fdep/LICENSE 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,19 @@ +Copyright (c) 2013 Lorenz Hüdepohl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff -Nru elpa-2016.05.001/test_project_2stage/fdep/README elpa-2019.11.001/test_project_2stage/fdep/README --- elpa-2016.05.001/test_project_2stage/fdep/README 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/fdep/README 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,99 @@ +fdep +---- + +fdep is a small set of scripts to teach autoconf/automake (using GNU make) +about the additional dependencies in Fortran 90 files due to modules. + +With this, Fortran files can be listed in any order in Makefile.am and parallel +builds work. + + +Usage +----- + + Put this project as a directory "fdep" in your source code, place the two + lines + + m4_include([fdep/fortran_dependencies.m4]) + FDEP_F90_GNU_MAKE_DEPS + + in your configure.ac, and add a single line + + @FORTRAN_MODULE_DEPS@ + + in your Makefile.am. All .F90 files of all programs in bin_PROGRAMS and all + libraries in lib_LTLIBRARIES will now be scanned for modules and the + resulting dependencies will be honoured. + + +What is the problem with Fortran 90 modules and make dependencies? +------------------------------------------------------------------ + + In Fortran 90 source files one can define any number of "modules", containing + variable and function definitions. The names of the modules defined in a file + can be arbitrary. + + In another source file these modules can be used, informing the Fortran + compiler about the definitions in these modules (e.g. to do type-checking). + This creates a problem, as the compiler has to know somehow where the module + is defined. 
+ + The usual solution employed by almost every Fortran compiler is to create + special "module" files for each module contained in a source file during + compilation. Their file name is derived by a compiler-specific recipe of the + modules identifier (usually the lower-cased module's identifier plus ".mod", + so "foo_module.mod" and "some_other_module.mod"). When the compiler + encounters a "use" statement during the compilation of another file, it + confers to this file to import the definitions of the module. + + That means, you cannot compile files using modules defined in yet un-compiled + files, one has to tell make about this dependency. + + (A primitive solution to this problem is listing the file in a pre-sorted + order, so that files defining modules are compiled first. + + However, that way the dependency-graph make knows about is incomplete and + parallel builds will fail with a high probability) + + +How does fdep solve this problem technically? +--------------------------------------------- + + As the name of the module files can be an arbitrary (and some compilers might + even save the module definitions in some completely different way), fdep + tells make about the module dependencies as a relation directly between + object files, e.g. when a file 'b.f90' is using any module of file 'a.f90', + fdep adds a dependency of + + b.o: a.o + + + More specifically, the perl-script fortran_dependencies.pl is run by make to + create a file .fortran_dependencies/dependencies.mk, which is then included. + To do this, first every source file (for every defined program and library) + is scanned for lines with "module" or "use" statements. These are saved in + two additional files (.use_mods and .def_mods) per source file and contain + lists of defined and required modules. The perl script then reads these in + and produces the appropriate rules. + + +Drawbacks +--------- + + GNU make is required. 
The detailed dependency graph due to "module" and "use" + statements is only available after pre-processing, when autoconf and even + configure is long over. To still get proper dependencies, fdep uses GNU + make's feature to include generated sub-Makefiles during a running make + invocation. + + +License +------- + + fdep is released under the MIT License. See the LICENSE file for details. + + +Contributing +------------ + + Send your patches or pull-request to dev@stellardeath.org diff -Nru elpa-2016.05.001/test_project_2stage/m4/ax_prog_fc_mpi.m4 elpa-2019.11.001/test_project_2stage/m4/ax_prog_fc_mpi.m4 --- elpa-2016.05.001/test_project_2stage/m4/ax_prog_fc_mpi.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/m4/ax_prog_fc_mpi.m4 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,162 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) +# +# DESCRIPTION +# +# This macro tries to find out how to compile Fortran77 programs that use +# MPI (Message Passing Interface), a standard API for parallel process +# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to +# be used instead of the standard macro AC_PROG_FC and will replace the +# standard variable FC with the found compiler. +# +# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the +# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will +# try to find out how to use MPI, if it fails, the macro will call +# AC_PROG_CC to find a standard C compiler instead. +# +# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found +# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If +# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. 
+# +# The following example demonstrates usage of the macro: +# +# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. +# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. +# # If --with-mpi=no is used, use a standard FC compiler instead. +# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], +# [compile with MPI (parallelization) support. If none is found, +# MPI is not used. Default: auto]) +# ],,[with_mpi=auto]) +# +# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ +# use_mpi=no +# if test x"$with_mpi" = xyes; then +# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) +# else +# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) +# fi +# ]) +# +# LICENSE +# +# Copyright (c) 2010,2011 Olaf Lenz +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. 
+# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +AC_DEFUN([AX_PROG_FC_MPI], [ +AC_PREREQ(2.50) + +# Check for compiler +# Needs to be split off into an extra macro to ensure right expansion +# order. +AC_REQUIRE([_AX_PROG_FC_MPI],[_AX_PROG_FC_MPI([$1])]) + +AS_IF([test x"$_ax_prog_fc_mpi_mpi_wanted" = xno], + [ _ax_prog_fc_mpi_mpi_found=no ], + [ + AC_LANG_PUSH([Fortran]) + + # test whether MPI_INIT is available + # We do not use AC_SEARCH_LIBS here, as it caches its outcome and + # thus disallows corresponding calls in the other AX_PROG_*_MPI + # macros. + for lib in NONE mpichf90 fmpi fmpich; do + save_LIBS=$LIBS + if test x"$lib" = xNONE; then + AC_MSG_CHECKING([for function MPI_INIT]) + else + AC_MSG_CHECKING([for function MPI_INIT in -l$lib]) + LIBS="-l$lib $LIBS" + fi + AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_INIT])], + [ _ax_prog_fc_mpi_mpi_found=yes ], + [ _ax_prog_fc_mpi_mpi_found=no ]) + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_found) + if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then + break; + fi + LIBS=$save_LIBS + done + + # Check for header + AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + AC_MSG_CHECKING([for mpif.h]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[[ + include 'mpif.h' +]])], + [ AC_MSG_RESULT(yes)], + [ AC_MSG_RESULT(no) + _ax_prog_fc_mpi_mpi_found=no + ]) + ]) + AC_LANG_POP([Fortran]) +]) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) + : +],[ + $3 + : +]) + +])dnl AX_PROG_FC_MPI + +dnl _AX_PROG_FC_MPI is an internal macro required by AX_PROG_FC_MPI. 
+dnl To ensure the right expansion order, the main function AX_PROG_FC_MPI +dnl has to be split into two parts. This part looks for the MPI +dnl compiler, while the other one tests whether an MPI program can be +dnl compiled. +dnl +AC_DEFUN([_AX_PROG_FC_MPI], [ + AC_ARG_VAR(MPIFC,[MPI Fortran compiler command]) + ifelse([$1],,[_ax_prog_fc_mpi_mpi_wanted=yes],[ + AC_MSG_CHECKING([whether to compile using MPI]) + if $1; then + _ax_prog_fc_mpi_mpi_wanted=yes + else + _ax_prog_fc_mpi_mpi_wanted=no + fi + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_wanted) + ]) + if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then + if test -z "$FC" && test -n "$MPIFC"; then + FC="$MPIFC" + else + AC_CHECK_TOOLS([FC], [mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77]) + fi + fi + AC_PROG_FC +])dnl _AX_PROG_FC_MPI diff -Nru elpa-2016.05.001/test_project_2stage/Makefile.am elpa-2019.11.001/test_project_2stage/Makefile.am --- elpa-2016.05.001/test_project_2stage/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/Makefile.am 2019-12-19 09:47:41.000000000 +0000 @@ -0,0 +1,13 @@ +## Process this file with automake to produce Makefile.in + +ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 + +AM_FCFLAGS = @FC_MODINC@modules $(ELPA_FCFLAGS) +AM_LDFLAGS = $(ELPA_LIBS) + +#bindir = $(abs_top_builddir) +bin_PROGRAMS = test_real2 +test_real2_SOURCES = src/test_real2.F90 + +distclean-local: + -rm config-f90.h diff -Nru elpa-2016.05.001/test_project_2stage/src/test_real2.F90 elpa-2019.11.001/test_project_2stage/src/test_real2.F90 --- elpa-2016.05.001/test_project_2stage/src/test_real2.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_2stage/src/test_real2.F90 2019-12-19 
09:47:41.000000000 +0000 @@ -0,0 +1,237 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +!> +!> Fortran test programm to demonstrates the use of +!> ELPA 2 real case library. 
+!> If "HAVE_REDIRECT" was defined at build time +!> the stdout and stderr output of each MPI task +!> can be redirected to files if the environment +!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set +!> to "true". +!> +!> By calling executable [arg1] [arg2] [arg3] [arg4] +!> one can define the size (arg1), the number of +!> Eigenvectors to compute (arg2), and the blocking (arg3). +!> If these values are not set default values (4000, 1500, 16) +!> are choosen. +!> If these values are set the 4th argument can be +!> "output", which specifies that the EV's are written to +!> an ascii file. +!> +program test_real_example + +!------------------------------------------------------------------------------- +! Standard eigenvalue problem - REAL version +! +! This program demonstrates the use of the ELPA module +! together with standard scalapack routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +! +!------------------------------------------------------------------------------- + + use iso_c_binding + + use elpa + +#ifdef HAVE_MPI_MODULE + use mpi + implicit none +#else + implicit none + include 'mpif.h' +#endif + + !------------------------------------------------------------------------------- + ! Please set system size parameters below! + ! na: System size + ! nev: Number of eigenvectors to be calculated + ! 
nblk: Blocking factor in block cyclic distribution + !------------------------------------------------------------------------------- + + integer :: nblk + integer :: na, nev + + integer :: np_rows, np_cols, na_rows, na_cols + + integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols + integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol + + integer, external :: numroc + + real(kind=c_double), allocatable :: a(:,:), z(:,:), ev(:) + + integer :: iseed(4096) ! Random seed, size should be sufficient for every generator + + integer :: STATUS + integer :: success + character(len=8) :: task_suffix + integer :: j + + integer, parameter :: error_units = 0 + + class(elpa_t), pointer :: e + !------------------------------------------------------------------------------- + + + ! default parameters + na = 1000 + nev = 500 + nblk = 16 + + call mpi_init(mpierr) + call mpi_comm_rank(mpi_comm_world,myid,mpierr) + call mpi_comm_size(mpi_comm_world,nprocs,mpierr) + + do np_cols = NINT(SQRT(REAL(nprocs))),2,-1 + if(mod(nprocs,np_cols) == 0 ) exit + enddo + ! at the end of the above loop, nprocs is always divisible by np_cols + + np_rows = nprocs/np_cols + + ! initialise BLACS + my_blacs_ctxt = mpi_comm_world + call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols) + call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol) + + if (myid==0) then + print '(a)','| Past BLACS_Gridinfo.' + end if + ! determine the neccessary size of the distributed matrices, + ! we use the scalapack tools routine NUMROC + + na_rows = numroc(na, nblk, my_prow, 0, np_rows) + na_cols = numroc(na, nblk, my_pcol, 0, np_cols) + + + ! set up the scalapack descriptor for the checks below + ! For ELPA the following restrictions hold: + ! - block sizes in both directions must be identical (args 4 a. 5) + ! - first row and column of the distributed matrix must be on + ! 
row/col 0/0 (arg 6 and 7) + + call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info) + + if (info .ne. 0) then + write(error_units,*) 'Error in BLACS descinit! info=',info + write(error_units,*) 'Most likely this happend since you want to use' + write(error_units,*) 'more MPI tasks than are possible for your' + write(error_units,*) 'problem size (matrix size and blocksize)!' + write(error_units,*) 'The blacsgrid can not be set up properly' + write(error_units,*) 'Try reducing the number of MPI tasks...' + call MPI_ABORT(mpi_comm_world, 1, mpierr) + endif + + if (myid==0) then + print '(a)','| Past scalapack descriptor setup.' + end if + + allocate(a (na_rows,na_cols)) + allocate(z (na_rows,na_cols)) + + allocate(ev(na)) + + ! we want different random numbers on every process + ! (otherwise A might get rank deficient): + + iseed(:) = myid + call RANDOM_SEED(put=iseed) + call RANDOM_NUMBER(z) + + a(:,:) = z(:,:) + + if (myid == 0) then + print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)' + endif + call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T + + !------------------------------------------------------------------------------- + + if (elpa_init(20171201) /= elpa_ok) then + print *, "ELPA API version not supported" + stop + endif + e => elpa_allocate() + + ! set parameters decribing the matrix and it's MPI distribution + call e%set("na", na, success) + call e%set("nev", nev, success) + call e%set("local_nrows", na_rows, success) + call e%set("local_ncols", na_cols, success) + call e%set("nblk", nblk, success) + call e%set("mpi_comm_parent", mpi_comm_world, success) + call e%set("process_row", my_prow, success) + call e%set("process_col", my_pcol, success) + + success = e%setup() + + call e%set("solver", elpa_solver_2stage, success) + + + ! Calculate eigenvalues/eigenvectors + + if (myid==0) then + print '(a)','| Entering two-step ELPA solver ... 
' + print * + end if + + call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only + call e%eigenvectors(a, ev, z, success) + + if (myid==0) then + print '(a)','| Two-step ELPA solver complete.' + print * + end if + + call elpa_deallocate(e) + call elpa_uninit() + + call blacs_gridexit(my_blacs_ctxt) + call mpi_finalize(mpierr) + +end + diff -Nru elpa-2016.05.001/test_project_C/autogen.sh elpa-2019.11.001/test_project_C/autogen.sh --- elpa-2016.05.001/test_project_C/autogen.sh 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/autogen.sh 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/sh -e + +mkdir -p m4/ + +test -n "$srcdir" || srcdir=`dirname "$0"` +test -n "$srcdir" || srcdir=. + +autoreconf --force --install --verbose "$srcdir" diff -Nru elpa-2016.05.001/test_project_C/configure.ac elpa-2019.11.001/test_project_C/configure.ac --- elpa-2016.05.001/test_project_C/configure.ac 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/configure.ac 2019-12-20 05:57:47.000000000 +0000 @@ -0,0 +1,94 @@ +AC_PREREQ([2.69]) +AC_INIT([elpa_test_project],[2019.11.001], elpa-library@rzg.mpg.de) +elpaversion="2019.11.001" +AC_CONFIG_SRCDIR([src/test_real.c]) + +AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) + +# Without this, automake tries to be smart and rebuilt +# the autoconf generated files such as configure, aclocal.m4, etc., +# in case the timestamps of files such as configure.ac are newer +# +# This only makes trouble for end users with out-of-date autoconf versions +# that cannot produce these files +AM_MAINTAINER_MODE([disable]) + +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_HEADERS([config.h]) +AM_SILENT_RULES([yes]) + +rm -rf config.h config-f90.h + +AX_CHECK_GNU_MAKE() +if test x$_cv_gnu_make_command = x ; then + AC_MSG_ERROR([Need GNU Make]) +fi + +AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) +if test x"${CPP_FOUND}" = xno; then + AC_MSG_ERROR([no cpp found]) +fi + +# gnu-make fortran module dependencies 
+m4_include([fdep/fortran_dependencies.m4]) +FDEP_F90_GNU_MAKE_DEPS + +AC_PROG_INSTALL +AM_PROG_CC_C_O +AM_PROG_AR +AM_PROG_AS + +AC_LANG([Fortran]) +m4_include([m4/ax_prog_fc_mpi.m4]) + +dnl check whether an mpi compiler is available; +dnl if not abort since it is mandatory +AX_PROG_FC_MPI([],[have_mpi=yes],[have_mpi=no + if test x"${have_mpi}" = xno; then + AC_MSG_ERROR([no mpi found]) + fi]) + +AC_FC_FREEFORM +AC_FC_MODULE_FLAG +AC_FC_MODULE_OUTPUT_FLAG + +AC_MSG_CHECKING(whether OpenMP usage is specified) +AC_ARG_WITH([openmp], + AS_HELP_STRING([--with-openmp], + [use OpenMP threading, default no.]), + [with_openmp=yes], + [with_openmp=no]) + AC_MSG_RESULT([${with_openmp}]) + if test x"${enable_openmp}" = x"yes"; then + with_openmp=yes + AC_MSG_CHECKING(whether --enable-openmp is specified) + AC_MSG_RESULT([${enable_openmp}]) + fi + AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"]) + if test x"${with_openmp}" = x"yes"; then + AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) + AX_ELPA_OPENMP + elpa="elpa_openmp-$elpaversion" + else + elpa="elpa-$elpaversion" + fi + +# Here comes the ELPA specific part +PKG_PROG_PKG_CONFIG +PKG_CHECK_MODULES([ELPA],[${elpa}],[],[AC_MSG_ERROR(["Need ${elpa}"])]) +PKG_CHECK_VAR([ELPA_FCFLAGS],[${elpa}],[fcflags]) + +LT_INIT + +AC_SUBST([FC_MODINC]) +AC_SUBST([FC_MODOUT]) + +rm -rf modules/ .fortran_dependencies/ +mkdir modules + +AC_CONFIG_FILES([ + Makefile +]) +AC_OUTPUT + +grep "^#define" config.h > config-f90.h diff -Nru elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.m4 elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.m4 --- elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.m4 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,19 @@ +AC_DEFUN([FDEP_F90_GNU_MAKE_DEPS],[ +AC_MSG_CHECKING([for GNU make]) +for a in "$MAKE" make gmake gnumake ; do + if test -z "$a" ; then continue ; fi ; + if 
( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then + _fdep_gnu_make_command=$a ; + break; + fi +done ; +AC_MSG_RESULT([$_fdep_gnu_make_command]) +if test x$_fdep_gnu_make_command = x ; then + AC_MSG_ERROR([Need GNU Make]) +fi +AC_SUBST([FORTRAN_MODULE_DEPS], [" +CLEANFILES += +include ${srcdir}/fdep/fortran_dependencies.mk +"]) +AM_SUBST_NOTMAKE([FORTRAN_MODULE_DEPS]) +]) diff -Nru elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.mk elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.mk --- elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.mk 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.mk 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,80 @@ +_f90_verbose = $(_f90_verbose_$(V)) +_f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY)) +_f90_verbose_0 = @echo " $1"; +_f90_targets = $(subst -,_,$(patsubst %.la,%_la,$(patsubst %.a,%_a,$(patsubst %.so,%_so,$(PROGRAMS) $(LTLIBRARIES))))) +FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror + +# $1 source files +# +# returns: file without any .F90 .f90 .F .f extension +define strip_fortran_ext +$(patsubst %.F90,%,$(patsubst %.f90,%,$(patsubst %.F,%,$(patsubst %.f,%,$1)))) +endef + +# $1 program +# +# returns: +# '1' if object files for target $1 are prefixed due to 'per-target' flags, +# '' (the empty string) otherwise. See the automake manual for 'per-target' +# compilation +# +define is_per_target +$(if $(filter $(call strip_fortran_ext,$(firstword $(call fortran_sources,$1))),$(patsubst %.o,%,$(patsubst %.lo,%,$($1_OBJECTS)))),,1) +endef + +# $1 top-level target name (i.e. an entry of _f90_targets) +# +# returns: all target source files matching *.F90 *.f90 *.F *.f +define fortran_sources +$(filter %.F90 %.f90 %.F %.f,$($1_SOURCES)) +endef + +# $1 top-level target name +# +# returns: the appropriate extension (i.e. 
'o' for normal programs, '.lo' for libraries) +define object_extension +$(if $(filter $1,$(PROGRAMS)),o,lo) +endef + +# $1 source_file +# $2 stem +# $3 program +define module_targets +$(eval _$3_use_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).use_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 USE [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *use [^ ,!:]*' | sort -u > $$@ + +$(eval _$3_def_mods += $(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3)) +$(dir $1)$2$(call strip_fortran_ext,$(notdir $1)).def_mods.$3.$(call object_extension,$3): $1 $(dir $1)$(am__dirstamp) + $(call _f90_verbose,F90 MOD [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($p_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | grep -i -o '^ *module [^!]*' | grep -v "\" > $$@ || true + +endef +$(foreach p,$(_f90_targets),$(if $(call is_per_target,$p),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,$p-,$p))),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,,$p))))) + +_f90_depdir=$(abs_builddir)/.fortran_dependencies +_f90_depfile = $(_f90_depdir)/dependencies.mk + +define is_clean +$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1) +endef + +define _fdep_newline + + +endef + +ifneq ($(call is_clean),1) +include $(_f90_depfile) +endif +$(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p) + $(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) >> $@ || { rm $@; exit 1; } ;$(_fdep_newline)) + +$(_f90_depdir): + @mkdir $@ + +$(foreach 
p,$(_f90_targets),$(_f90_depdir)/$p): | $(_f90_depdir) + @mkdir $@ + +CLEANFILES += $(foreach p,$(_f90_targets),$(_$p_def_mods) $(_$p_use_mods)) +CLEANFILES += $(foreach p,$(_f90_targets),$(_f90_depdir)/$p/*) +CLEANFILES += $(_f90_depfile) diff -Nru elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.pl elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.pl --- elpa-2016.05.001/test_project_C/fdep/fortran_dependencies.pl 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/fdep/fortran_dependencies.pl 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,77 @@ +#!/usr/bin/perl -w + +use strict; + +my %defs = (); +my %uses = (); + +my $use_re = qr/^\s*use\s+(\S+)\s*$/; +my $def_re = qr/^\s*module\s+(\S+)\s*$/; + +sub add_use { + my ($file, $module) = @_; + if (defined($defs{$module}) && $defs{$module} eq $file) { + # do not add self-dependencies + return; + } + if (!defined($uses{$file})) { + $uses{$file} = { $module => 1 }; + } else { + $uses{$file}{$module} = 1; + } +} + +sub add_def { + my ($file, $module) = @_; + if (!defined($defs{$module})) { + $defs{$module} = $file; + if (defined($uses{$file}) && defined($uses{$file}{$module})) { + delete $uses{$file}{$module}; + } + } else { + die "Module $module both defined in $file, $defs{$module}"; + } +} + +my $p = shift; + +foreach my $file (@ARGV) { + my $re; + my $add; + my $object; + if (defined($ENV{V}) && $ENV{V} ge "2") { + print STDERR "fdep: Considering file $file\n"; + } + if ($file =~ /^(.*)\.def_mods.$p(\..*)$/) { + $re = $def_re; + $add = \&add_def; + $object = $1 . $2; + } elsif ($file =~ /^(.*)\.use_mods.$p(\..*)$/) { + $re = $use_re; + $add = \&add_use; + $object = $1 . 
$2; + } else { + die "Unrecognized file extension for '$file'\nExpected (.*)\.def_mods.$p(\..*) or (.*)\.use_mods.$p(\..*)"; + } + open(FILE,"<",$file) || die "\nCan't open $file: $!\n\n"; + while() { + chomp; + $_ = lc($_); + if ($_ =~ $re) { + &$add($object, $1); + } else { + die "Cannot parse module statement '$_', was expecting $re"; + } + } + close(FILE) +} + +foreach my $object (sort keys %uses) { + for my $m (keys %{$uses{$object}}) { + if (defined $defs{$m}) { + print "$object: ", $defs{$m}, "\n"; + } elsif (defined($ENV{V}) && $ENV{V} ge "1") { + print STDERR "fdep: Warning: Cannot find definition of module $m in files for program $p, might be external\n"; + } + } +} diff -Nru elpa-2016.05.001/test_project_C/fdep/LICENSE elpa-2019.11.001/test_project_C/fdep/LICENSE --- elpa-2016.05.001/test_project_C/fdep/LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/fdep/LICENSE 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,19 @@ +Copyright (c) 2013 Lorenz Hüdepohl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff -Nru elpa-2016.05.001/test_project_C/fdep/README elpa-2019.11.001/test_project_C/fdep/README --- elpa-2016.05.001/test_project_C/fdep/README 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/fdep/README 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,99 @@ +fdep +---- + +fdep is a small set of scripts to teach autoconf/automake (using GNU make) +about the additional dependencies in Fortran 90 files due to modules. + +With this, Fortran files can be listed in any order in Makefile.am and parallel +builds work. + + +Usage +----- + + Put this project as a directory "fdep" in your source code, place the two + lines + + m4_include([fdep/fortran_dependencies.m4]) + FDEP_F90_GNU_MAKE_DEPS + + in your configure.ac, and add a single line + + @FORTRAN_MODULE_DEPS@ + + in your Makefile.am. All .F90 files of all programs in bin_PROGRAMS and all + libraries in lib_LTLIBRARIES will now be scanned for modules and the + resulting dependencies will be honoured. + + +What is the problem with Fortran 90 modules and make dependencies? +------------------------------------------------------------------ + + In Fortran 90 source files one can define any number of "modules", containing + variable and function definitions. The names of the modules defined in a file + can be arbitrary. + + In another source file these modules can be used, informing the Fortran + compiler about the definitions in these modules (e.g. to do type-checking). + This creates a problem, as the compiler has to know somehow where the module + is defined. + + The usual solution employed by almost every Fortran compiler is to create + special "module" files for each module contained in a source file during + compilation. 
Their file name is derived by a compiler-specific recipe of the + modules identifier (usually the lower-cased module's identifier plus ".mod", + so "foo_module.mod" and "some_other_module.mod"). When the compiler + encounters a "use" statement during the compilation of another file, it + confers to this file to import the definitions of the module. + + That means, you cannot compile files using modules defined in yet un-compiled + files, one has to tell make about this dependency. + + (A primitive solution to this problem is listing the file in a pre-sorted + order, so that files defining modules are compiled first. + + However, that way the dependency-graph make knows about is incomplete and + parallel builds will fail with a high probability) + + +How does fdep solve this problem technically? +--------------------------------------------- + + As the name of the module files can be an arbitrary (and some compilers might + even save the module definitions in some completely different way), fdep + tells make about the module dependencies as a relation directly between + object files, e.g. when a file 'b.f90' is using any module of file 'a.f90', + fdep adds a dependency of + + b.o: a.o + + + More specifically, the perl-script fortran_dependencies.pl is run by make to + create a file .fortran_dependencies/dependencies.mk, which is then included. + To do this, first every source file (for every defined program and library) + is scanned for lines with "module" or "use" statements. These are saved in + two additional files (.use_mods and .def_mods) per source file and contain + lists of defined and required modules. The perl script then reads these in + and produces the appropriate rules. + + +Drawbacks +--------- + + GNU make is required. The detailed dependency graph due to "module" and "use" + statements is only available after pre-processing, when autoconf and even + configure is long over. 
To still get proper dependencies, fdep uses GNU + make's feature to include generated sub-Makefiles during a running make + invocation. + + +License +------- + + fdep is released under the MIT License. See the LICENSE file for details. + + +Contributing +------------ + + Send your patches or pull-request to dev@stellardeath.org diff -Nru elpa-2016.05.001/test_project_C/m4/ax_prog_fc_mpi.m4 elpa-2019.11.001/test_project_C/m4/ax_prog_fc_mpi.m4 --- elpa-2016.05.001/test_project_C/m4/ax_prog_fc_mpi.m4 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/m4/ax_prog_fc_mpi.m4 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,162 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]]) +# +# DESCRIPTION +# +# This macro tries to find out how to compile Fortran77 programs that use +# MPI (Message Passing Interface), a standard API for parallel process +# communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to +# be used instead of the standard macro AC_PROG_FC and will replace the +# standard variable FC with the found compiler. +# +# MPI-WANTED-TEST is used to test whether MPI is actually wanted by the +# user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will +# try to find out how to use MPI, if it fails, the macro will call +# AC_PROG_CC to find a standard C compiler instead. +# +# When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found +# (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If +# ACTION-IF-FOUND is not set, the macro will define HAVE_MPI. +# +# The following example demonstrates usage of the macro: +# +# # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found. 
+# # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. +# # If --with-mpi=no is used, use a standard FC compiler instead. +# AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], +# [compile with MPI (parallelization) support. If none is found, +# MPI is not used. Default: auto]) +# ],,[with_mpi=auto]) +# +# AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ +# use_mpi=no +# if test x"$with_mpi" = xyes; then +# AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) +# else +# AC_MSG_WARN([No MPI compiler found, won't use MPI.]) +# fi +# ]) +# +# LICENSE +# +# Copyright (c) 2010,2011 Olaf Lenz +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. 
When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +AC_DEFUN([AX_PROG_FC_MPI], [ +AC_PREREQ(2.50) + +# Check for compiler +# Needs to be split off into an extra macro to ensure right expansion +# order. +AC_REQUIRE([_AX_PROG_FC_MPI],[_AX_PROG_FC_MPI([$1])]) + +AS_IF([test x"$_ax_prog_fc_mpi_mpi_wanted" = xno], + [ _ax_prog_fc_mpi_mpi_found=no ], + [ + AC_LANG_PUSH([Fortran]) + + # test whether MPI_INIT is available + # We do not use AC_SEARCH_LIBS here, as it caches its outcome and + # thus disallows corresponding calls in the other AX_PROG_*_MPI + # macros. + for lib in NONE mpichf90 fmpi fmpich; do + save_LIBS=$LIBS + if test x"$lib" = xNONE; then + AC_MSG_CHECKING([for function MPI_INIT]) + else + AC_MSG_CHECKING([for function MPI_INIT in -l$lib]) + LIBS="-l$lib $LIBS" + fi + AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_INIT])], + [ _ax_prog_fc_mpi_mpi_found=yes ], + [ _ax_prog_fc_mpi_mpi_found=no ]) + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_found) + if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then + break; + fi + LIBS=$save_LIBS + done + + # Check for header + AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + AC_MSG_CHECKING([for mpif.h]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[[ + include 'mpif.h' +]])], + [ AC_MSG_RESULT(yes)], + [ AC_MSG_RESULT(no) + _ax_prog_fc_mpi_mpi_found=no + ]) + ]) + AC_LANG_POP([Fortran]) +]) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [ + ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2]) + : +],[ + $3 + : +]) + +])dnl AX_PROG_FC_MPI + +dnl _AX_PROG_FC_MPI is an internal macro required by AX_PROG_FC_MPI. +dnl To ensure the right expansion order, the main function AX_PROG_FC_MPI +dnl has to be split into two parts. 
This part looks for the MPI +dnl compiler, while the other one tests whether an MPI program can be +dnl compiled. +dnl +AC_DEFUN([_AX_PROG_FC_MPI], [ + AC_ARG_VAR(MPIFC,[MPI Fortran compiler command]) + ifelse([$1],,[_ax_prog_fc_mpi_mpi_wanted=yes],[ + AC_MSG_CHECKING([whether to compile using MPI]) + if $1; then + _ax_prog_fc_mpi_mpi_wanted=yes + else + _ax_prog_fc_mpi_mpi_wanted=no + fi + AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_wanted) + ]) + if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then + if test -z "$FC" && test -n "$MPIFC"; then + FC="$MPIFC" + else + AC_CHECK_TOOLS([FC], [mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77]) + fi + fi + AC_PROG_FC +])dnl _AX_PROG_FC_MPI diff -Nru elpa-2016.05.001/test_project_C/Makefile.am elpa-2019.11.001/test_project_C/Makefile.am --- elpa-2016.05.001/test_project_C/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/Makefile.am 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,14 @@ +## Process this file with automake to produce Makefile.in + +ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 + +AM_FCFLAGS = @FC_MODINC@modules $(ELPA_FCFLAGS) +AM_CFLAGS = $(ELPA_CFLAGS) +AM_LDFLAGS = $(ELPA_LIBS) + +#bindir = $(abs_top_builddir) +bin_PROGRAMS = test_real +test_real_SOURCES = src/test_real.c src/test_blacs_infrastructure.F90 + +distclean-local: + -rm config-f90.h diff -Nru elpa-2016.05.001/test_project_C/src/test_blacs_infrastructure.F90 elpa-2019.11.001/test_project_C/src/test_blacs_infrastructure.F90 --- elpa-2016.05.001/test_project_C/src/test_blacs_infrastructure.F90 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/src/test_blacs_infrastructure.F90 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 
+1,172 @@ +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! 
+#include "config-f90.h" +#define WITH_MPI 1 +module test_blacs_infrastructure + use iso_c_binding + use mpi + integer, parameter :: ik = C_INT32_T + + contains + + !c> void set_up_blacsgrid_f(int mpi_comm_parent, int np_rows, int np_cols, char layout, + !c> int* my_blacs_ctxt, int *my_prow, int *my_pcol); + subroutine set_up_blacsgrid(mpi_comm_parent, np_rows, np_cols, layout, & + my_blacs_ctxt, my_prow, my_pcol) bind(C, name="set_up_blacsgrid_f") + + !use test_util + + implicit none + integer(kind=c_int), intent(in), value :: mpi_comm_parent, np_rows, np_cols + character(len=1), intent(in), value :: layout + integer(kind=c_int), intent(out) :: my_blacs_ctxt, my_prow, my_pcol + +#ifdef WITH_MPI + integer :: np_rows_, np_cols_ +#endif + + if (layout /= 'R' .and. layout /= 'C') then + print *, "layout must be 'R' or 'C'" + stop 1 + end if + + my_blacs_ctxt = mpi_comm_parent +#ifdef WITH_MPI + call BLACS_Gridinit(my_blacs_ctxt, layout, np_rows, np_cols) + call BLACS_Gridinfo(my_blacs_ctxt, np_rows_, np_cols_, my_prow, my_pcol) + if (np_rows /= np_rows_) then + print *, "BLACS_Gridinfo returned different values for np_rows as set by BLACS_Gridinit" + stop 1 + endif + if (np_cols /= np_cols_) then + print *, "BLACS_Gridinfo returned different values for np_cols as set by BLACS_Gridinit" + stop 1 + endif +#else + my_prow = 0 + my_pcol = 0 +#endif + end subroutine + + subroutine set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, my_blacs_ctxt, info) + + !use elpa_utilities, only : error_unit + !use test_util + implicit none + + integer(kind=ik), intent(in) :: na, nblk, my_prow, my_pcol, np_rows, & + np_cols, & + my_blacs_ctxt, info + integer(kind=ik), intent(out) :: na_rows, na_cols, sc_desc(1:9) + + integer(kind=ik), parameter :: error_unit=0 +#ifdef WITH_MPI + integer(kind=ik), external :: numroc + integer(kind=ik) :: mpierr + + sc_desc(:) = 0 + ! determine the neccessary size of the distributed matrices, + ! 
we use the scalapack tools routine NUMROC + + na_rows = numroc(na, nblk, my_prow, 0, np_rows) + na_cols = numroc(na, nblk, my_pcol, 0, np_cols) + + ! set up the scalapack descriptor for the checks below + ! For ELPA the following restrictions hold: + ! - block sizes in both directions must be identical (args 4 a. 5) + ! - first row and column of the distributed matrix must be on + ! row/col 0/0 (arg 6 and 7) + + call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info) + + if (info .ne. 0) then + write(error_unit,*) 'Error in BLACS descinit! info=',info + write(error_unit,*) 'Most likely this happend since you want to use' + write(error_unit,*) 'more MPI tasks than are possible for your' + write(error_unit,*) 'problem size (matrix size and blocksize)!' + write(error_unit,*) 'The blacsgrid can not be set up properly' + write(error_unit,*) 'Try reducing the number of MPI tasks...' + call MPI_ABORT(mpi_comm_world, 1, mpierr) + endif +#else /* WITH_MPI */ + na_rows = na + na_cols = na +#endif /* WITH_MPI */ + + end subroutine + + !c> void set_up_blacs_descriptor_f(int na, int nblk, int my_prow, int my_pcol, + !c> int np_rows, int np_cols, + !c> int *na_rows, int *na_cols, + !c> int sc_desc[9], + !c> int my_blacs_ctxt, + !c> int *info); + subroutine set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, & + my_blacs_ctxt, info) & + bind(C, name="set_up_blacs_descriptor_f") + + use iso_c_binding + implicit none + + + integer(kind=c_int), value :: na, nblk, my_prow, my_pcol, np_rows, & + np_cols, my_blacs_ctxt + integer(kind=c_int) :: na_rows, na_cols, info, sc_desc(1:9) + + call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, & + np_rows, np_cols, na_rows, & + na_cols, sc_desc, my_blacs_ctxt, info) + + + end subroutine + + integer function index_l2g(idx_loc, nblk, iproc, nprocs) + index_l2g = nprocs * nblk * ((idx_loc-1) / nblk) + mod(idx_loc-1,nblk) + mod(nprocs+iproc, nprocs)*nblk + 1 + return 
+ end function + +end module diff -Nru elpa-2016.05.001/test_project_C/src/test_real.c elpa-2019.11.001/test_project_C/src/test_real.c --- elpa-2016.05.001/test_project_C/src/test_real.c 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/test_project_C/src/test_real.c 2019-12-19 09:47:44.000000000 +0000 @@ -0,0 +1,190 @@ +/* This file is part of ELPA. + + The ELPA library was originally created by the ELPA consortium, + consisting of the following organizations: + + - Max Planck Computing and Data Facility (MPCDF), formerly known as + Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), + - Bergische Universität Wuppertal, Lehrstuhl für angewandte + Informatik, + - Technische Universität München, Lehrstuhl für Informatik mit + Schwerpunkt Wissenschaftliches Rechnen , + - Fritz-Haber-Institut, Berlin, Abt. Theorie, + - Max-Plack-Institut für Mathematik in den Naturwissenschaften, + Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, + and + - IBM Deutschland GmbH + + + More information can be found here: + http://elpa.mpcdf.mpg.de/ + + ELPA is free software: you can redistribute it and/or modify + it under the terms of the version 3 of the license of the + GNU Lesser General Public License as published by the Free + Software Foundation. + + ELPA is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with ELPA. 
If not, see + + ELPA reflects a substantial effort on the part of the original + ELPA consortium, and we ask you to respect the spirit of the + license that we chose: i.e., please contribute any changes you + may have back to the original ELPA library distribution, and keep + any derivatives of ELPA under the same license that we chose for + the original distribution, the GNU Lesser General Public License. +*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include +#include + +#define assert_elpa_ok(x) assert(x == ELPA_OK) + +int main(int argc, char** argv) { + /* matrix dimensions */ + const int na = 1000; + const int nev = 500; + const int nblk = 16; + + /* mpi */ + int myid, nprocs; + int na_cols, na_rows; + int np_cols, np_rows; + int my_prow, my_pcol; + int mpi_comm; + + /* blacs */ + int my_blacs_ctxt, sc_desc[9], info; + + /* The Matrix */ + double *a, *as, *z; + double *ev; + + int error, status; + + elpa_t handle; + + int value; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + + for (np_cols = (int) sqrt((double) nprocs); np_cols > 1; np_cols--) { + if (nprocs % np_cols == 0) { + break; + } + } + + np_rows = nprocs/np_cols; + + /* set up blacs */ + /* convert communicators before */ + mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD); + set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol); + set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info); + + /* allocate the matrices needed for elpa */ + a = calloc(na_rows*na_cols, sizeof(double)); + z = calloc(na_rows*na_cols, sizeof(double)); + as = calloc(na_rows*na_cols, sizeof(double)); + ev = calloc(na, sizeof(double)); + + // TODO: prepare properly + memset(a, 0, na_rows * na_cols * sizeof(double)); + //prepare_matrix_real_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as); + + if (elpa_init(20170403) != ELPA_OK) { + 
fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } + + if(myid == 0) printf("init done\n"); + + handle = elpa_allocate(&error); + assert_elpa_ok(error); + + /* Set parameters */ + elpa_set(handle, "na", na, &error); + assert_elpa_ok(error); + + elpa_set(handle, "nev", nev, &error); + assert_elpa_ok(error); + + elpa_set(handle, "local_nrows", na_rows, &error); + assert_elpa_ok(error); + + elpa_set(handle, "local_ncols", na_cols, &error); + assert_elpa_ok(error); + + elpa_set(handle, "nblk", nblk, &error); + assert_elpa_ok(error); + + elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(MPI_COMM_WORLD), &error); + assert_elpa_ok(error); + + elpa_set(handle, "process_row", my_prow, &error); + assert_elpa_ok(error); + + elpa_set(handle, "process_col", my_pcol, &error); + assert_elpa_ok(error); +// + /* Setup */ + assert_elpa_ok(elpa_setup(handle)); + + if(myid == 0) printf("setup done\n"); + /* Set tunables */ + elpa_set(handle, "solver", ELPA_SOLVER_1STAGE, &error); + // elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error); + assert_elpa_ok(error); + +// elpa_set(handle, "real_kernel", TEST_KERNEL, &error); +// assert_elpa_ok(error); + + if(myid == 0) printf("solve..\n"); + + /* Solve EV problem */ + elpa_eigenvectors(handle, a, ev, z, &error); + assert_elpa_ok(error); + if(myid == 0) printf("solve done \n"); + +// for(int i = 0; i < na; i++) +// printf("%lf, ", ev[i]); +// printf("\n"); + + elpa_deallocate(handle); + elpa_uninit(); + + + /* check the results */ +// status = check_correctness_real_double_f(na, nev, na_rows, na_cols, as, z, ev, sc_desc, myid); + +// if (status !=0){ +// printf("The computed EVs are not correct !\n"); +// } +// if (status ==0){ +// printf("All ok!\n"); +// } + + free(a); + free(z); + free(as); + free(ev); + + MPI_Finalize(); + + return !!status; +} diff -Nru elpa-2016.05.001/USERS_GUIDE_DEPRECATED_LEGACY_API.md elpa-2019.11.001/USERS_GUIDE_DEPRECATED_LEGACY_API.md --- 
elpa-2016.05.001/USERS_GUIDE_DEPRECATED_LEGACY_API.md 1970-01-01 00:00:00.000000000 +0000 +++ elpa-2019.11.001/USERS_GUIDE_DEPRECATED_LEGACY_API.md 2019-12-20 05:57:47.000000000 +0000 @@ -0,0 +1,443 @@ +## Users guide for the *ELPA* library with the legacy interface ## + +**DISCLAIMER** +This document provides some guidelines for using the legacy interface of the *ELPA* library with user applications. +The legacy interface is deprecated and will be disabled at some point without any special anouncement. +The following guidelines will not be updated or corrected anymore. +**We strongly recommend all users to use the long-term supported new API of ELPA, which has been published with the +release of 2017.05.001.** + +## A) Using the legacy API of the *ELPA* library ## + +The following description describes the usage of the *ELPA* library with the legacy interface. +This legacy API is deprecated and will be disabled at some point. We strongly recommend all users +to switch to the new API!. Nevertheless, for historic reasons we give some hints on how to use the legacy +API. + +### A.1) General concept of the *ELPA* library ### + +The *ELPA* library consists of two main parts: +- *ELPA 1stage* solver +- *ELPA 2stage* solver + +Both variants of the *ELPA* solvers are available for real or complex singe and double precision valued matrices. 
+ +Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.11.001/html/index.html) for details): + +- elpa_get_communicators : set the row / column communicators for *ELPA* +- elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue proplem with the *ELPA 1stage* solver +- elpa_solve_evp_real_1stage_{single|double} : solve a {single|double} precision real eigenvalue proplem with the *ELPA 1stage* solver +- elpa_solve_evp_complex_2stage_{single|double} : solve a {single|double} precision complex eigenvalue proplem with the *ELPA 2stage* solver +- elpa_solve_evp_real_2stage_{single|double} : solve a {single|double} precision real eigenvalue proplem with the *ELPA 2stage* solver +- elpa_solve_evp_real_{single|double} : driver for the {single|double} precision real *ELPA 1stage* or *ELPA 2stage* solver +- elpa_solve_evp_complex_{single|double} : driver for the {single|double} precision complex *ELPA 1stage* or *ELPA 2stage* solver + + + +Furthermore *ELPA* provides the utility binary "elpa2_print_available_kernels": it tells the user +which *ELPA 2stage* compute kernels have been installed and which default kernels are set + +If you want to solve an eigenvalue problem with *ELPA*, you have to decide whether you +want to use *ELPA 1stage* or *ELPA 2stage* solver. Normally, *ELPA 2stage* is the better +choice since it is faster, but there are matrix dimensions where *ELPA 1stage* is superior. + +Independent of the choice of the solver, the concept of calling *ELPA* is always the same: + +### A.2) MPI version of *ELPA* ### + +In this case, *ELPA* relies on a BLACS distributed matrix. +To solve a Eigenvalue problem of this matrix with *ELPA*, one has + +1. to include the *ELPA* header (C case) or module (Fortran) +2. to create row and column MPI communicators for ELPA (with "elpa_get_communicators") +3. 
to call to the *ELPA driver* or directly call *ELPA 1stage* or *ELPA 2stage* for the matrix. + +Here is a very simple MPI code snippet for using *ELPA 1stage*: For the definition of all variables +please have a look at the man pages and/or the online documentation (see above). A full version +of a simple example program can be found in ./test_project_1stage_legacy_api/src. + + + ! All ELPA routines need MPI communicators for communicating within + ! rows or columns of processes, these are set in elpa_get_communicators + + success = elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, & + mpi_comm_rows, mpi_comm_cols) + + if (myid==0) then + print '(a)','| Past split communicator setup for rows and columns.' + end if + + ! Determine the necessary size of the distributed matrices, + ! we use the Scalapack tools routine NUMROC for that. + + na_rows = numroc(na, nblk, my_prow, 0, np_rows) + na_cols = numroc(na, nblk, my_pcol, 0, np_cols) + + !------------------------------------------------------------------------------- + ! Calculate eigenvalues/eigenvectors + + if (myid==0) then + print '(a)','| Entering one-step ELPA solver ... ' + print * + end if + + success = elpa_solve_evp_real_1stage_{single|double} (na, nev, a, na_rows, ev, z, na_rows, nblk, & + matrixCols, mpi_comm_rows, mpi_comm_cols) + + if (myid==0) then + print '(a)','| One-step ELPA solver complete.' + print * + end if + + +#### Shared-memory version of *ELPA* #### + +If the *ELPA* library has been compiled with the configure option "--with-mpi=0", +no MPI will be used. + +Still the **same** call sequence as in the MPI case can be used (see above). 
+ + + +#### Setting the row and column communicators #### + +SYNOPSIS + FORTRAN INTERFACE + use elpa1 + + success = elpa_get_communicators (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) + + integer, intent(in) mpi_comm_global: global communicator for the calculation + integer, intent(in) my_prow: row coordinate of the calling process in the process grid + integer, intent(in) my_pcol: column coordinate of the calling process in the process grid + integer, intent(out) mpi_comm_row: communicator for communication within rows of processes + integer, intent(out) mpi_comm_row: communicator for communication within columns of processes + + integer success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function + + C INTERFACE + #include "elpa_generated.h" + + success = elpa_get_communicators (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols); + + int mpi_comm_global: global communicator for the calculation + int my_prow: row coordinate of the calling process in the process grid + int my_pcol: column coordinate of the calling process in the process grid + int *mpi_comm_row: pointer to the communicator for communication within rows of processes + int *mpi_comm_row: pointer to the communicator for communication within columns of processes + + int success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function + + +#### Using *ELPA 1stage* #### + +After setting up the *ELPA* row and column communicators (by calling elpa_get_communicators), +only the real or complex valued solver has to be called: + +SYNOPSIS + FORTRAN INTERFACE + use elpa1 + success = elpa_solve_evp_real_1stage_{single|double} (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, + mpi_comm_cols) + + With the definintions of the input and output variables: + + integer, intent(in) na: global dimension of quadratic matrix a to solve + integer, intent(in) nev: 
number of eigenvalues to be computed; the first nev eigenvalules are calculated + real*{4|8}, intent(inout) a: locally distributed part of the matrix a. The local dimensions are lda x matrixCols + integer, intent(in) lda: leading dimension of locally distributed matrix a + real*{4|8}, intent(inout) ev: on output the first nev computed eigenvalues + real*{4|8}, intent(inout) q: on output the first nev computed eigenvectors + integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors + integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions + integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q + integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + integer, intent(in) mpi_comm_cols: communicator for communication in colums. Constructed with elpa_get_communicators(3) + + logical success: return value indicating success or failure + + C INTERFACE + #include "elpa.h" + + success = elpa_solve_evp_real_1stage_{single|double} (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int + mpi_comm_rows, int mpi_comm_cols); + + With the definintions of the input and output variables: + + int na: global dimension of quadratic matrix a to solve + int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + {float|double} *a: pointer to locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols + int lda: leading dimension of locally distributed matrix a + {float|double} *ev: pointer to memory containing on output the first nev computed eigenvalues + {float|double} *q: pointer to memory containing on output the first nev computed eigenvectors + int ldq: leading dimension of matrix q which stores the eigenvectors + int nblk: blocksize of block cyclic distributin, must be the same in both directions + int matrixCols: number of columns of locally distributed matrices a and q + int mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + int mpi_comm_cols: communicator for communication in colums. Constructed with elpa_get_communicators(3) + + int success: return value indicating success (1) or failure (0) + +DESCRIPTION + Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the + elpa_get_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. + The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues + will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + + FORTRAN INTERFACE + use elpa1 + success = elpa_solve_evp_complex_1stage_{single|double} (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, + mpi_comm_cols) + + With the definintions of the input and output variables: + + integer, intent(in) na: global dimension of quadratic matrix a to solve + integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + complex*{8|16}, intent(inout) a: locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols + integer, intent(in) lda: leading dimension of locally distributed matrix a + real*{4|8}, intent(inout) ev: on output the first nev computed eigenvalues + complex*{8|16}, intent(inout) q: on output the first nev computed eigenvectors + integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors + integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions + integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q + integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + integer, intent(in) mpi_comm_cols: communicator for communication in colums. Constructed with elpa_get_communicators(3) + + logical success: return value indicating success or failure + + C INTERFACE + #include "elpa.h" + #include + + success = elpa_solve_evp_complex_1stage_{single|double} (int na, int nev, double complex *a, int lda, double *ev, double complex*q, int ldq, int nblk, int + matrixCols, int mpi_comm_rows, int mpi_comm_cols); + + With the definintions of the input and output variables: + + int na: global dimension of quadratic matrix a to solve + int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + {float|double} complex *a: pointer to locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols + int lda: leading dimension of locally distributed matrix a + {float|double} *ev: pointer to memory containing on output the first nev computed eigenvalues + {float|double} complex *q: pointer to memory containing on output the first nev computed eigenvectors + int ldq: leading dimension of matrix q which stores the eigenvectors + int nblk: blocksize of block cyclic distributin, must be the same in both directions + int matrixCols: number of columns of locally distributed matrices a and q + int mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + int mpi_comm_cols: communicator for communication in colums. Constructed with elpa_get_communicators(3) + + int success: return value indicating success (1) or failure (0) + +DESCRIPTION + Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the + elpa_get_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. + The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues + will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + + +The *ELPA 1stage* solver, does not need or accept any other parameters than in the above +specification. + +#### Using *ELPA 2stage* #### + +The *ELPA 2stage* solver can be used in the same manner, as the *ELPA 1stage* solver. +However, the 2 stage solver, can be used with different compute kernels, which offers +more possibilities for configuration. + +It is recommended to first call the utility program + +elpa2_print_kernels + +which will tell all the compute kernels that can be used with *ELPA 2stage* + +##### Using the default kernels ##### + +If no kernel is set via the *ELPA 2stage API* then the default kernels will be set. 
+ +##### Setting the *ELPA 2stage* compute kernels ##### + +##### Setting the *ELPA 2stage* compute kernels with environment variables##### + + +The utility program "elpa2_print_kernels" can list which kernels are available and which +would be chosen. This reflects the setting of the default kernel. + +##### Setting the *ELPA 2stage* compute kernels with API calls##### + +It is also possible to set the *ELPA 2stage* compute kernels via the API. + +As an example the API for ELPA real double-precision 2stage is shown: + +SYNOPSIS + FORTRAN INTERFACE + use elpa1 + use elpa2 + success = elpa_solve_evp_real_2stage_double (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, + mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU) + + With the definintions of the input and output variables: + + integer, intent(in) na: global dimension of quadratic matrix a to solve + integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + real*{4|8}, intent(inout) a: locally distributed part of the matrix a. The local dimensions are lda x matrixCols + integer, intent(in) lda: leading dimension of locally distributed matrix a + real*{4|8}, intent(inout) ev: on output the first nev computed eigenvalues + real*{4|8}, intent(inout) q: on output the first nev computed eigenvectors + integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors + integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions + integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q + integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + integer, intent(in) mpi_comm_cols: communicator for communication in colums. 
Constructed with elpa_get_communicators(3) + integer, intent(in) mpi_comm_all: communicator for all processes in the processor set involved in ELPA + logical, intent(in), optional: useQR: optional argument; switches to QR-decomposition if set to .true. + logical, intent(in), optional: useGPU: decide whether GPUs should be used ore not + + logical success: return value indicating success or failure + + C INTERFACE + #include "elpa.h" + + success = elpa_solve_evp_real_2stage_double (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int + mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_ELPA_REAL_KERNEL, int useQR, int useGPU); + + With the definintions of the input and output variables: + + int na: global dimension of quadratic matrix a to solve + int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + double *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols + int lda: leading dimension of locally distributed matrix a + double *ev: pointer to memory containing on output the first nev computed eigenvalues + double *q: pointer to memory containing on output the first nev computed eigenvectors + int ldq: leading dimension of matrix q which stores the eigenvectors + int nblk: blocksize of block cyclic distributin, must be the same in both directions + int matrixCols: number of columns of locally distributed matrices a and q + int mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators(3) + int mpi_comm_cols: communicator for communication in colums. 
Constructed with elpa_get_communicators(3) + int mpi_comm_all: communicator for all processes in the processor set involved in ELPA + int useQR: if set to 1 switch to QR-decomposition + int useGPU: decide whether the GPU version should be used or not + + int success: return value indicating success (1) or failure (0) + + +DESCRIPTION + Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the + elpa_get_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. + The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues + will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + +##### Setting up *ELPA 1stage* or *ELPA 2stage* with the *ELPA driver interface* ##### + +Since release ELPA 2016.005.004 a driver routine allows to choose more easily which solver (1stage or 2stage) will be used. + +As an exmple the real double-precision case is explained: + + SYNOPSIS + + FORTRAN INTERFACE + + use elpa_driver + + success = elpa_solve_evp_real_double (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL=THIS_REAL_ELPA_KERNEL, useQR, useGPU, method=method) + + Generalized interface to the ELPA 1stage and 2stage solver for real-valued problems + + With the definintions of the input and output variables: + + + integer, intent(in) na: global dimension of quadratic matrix a to solve + + integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated + + real*8, intent(inout) a: locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols + + integer, intent(in) lda: leading dimension of locally distributed matrix a + + real*8, intent(inout) ev: on output the first nev computed eigenvalues + + real*8, intent(inout) q: on output the first nev computed eigenvectors + + integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors + + integer, intent(in) nblk: blocksize of block cyclic distribution, must be the same in both directions + + integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q + + integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators + + integer, intent(in) mpi_comm_cols: communicator for communication in columns. Constructed with elpa_get_communicators + + integer, intent(in) mpi_comm_all: communicator for all processes in the processor set involved in ELPA + + integer, intent(in), optional: THIS_REAL_ELPA_KERNEL: optional argument, choose the compute kernel for 2-stage solver + + logical, intent(in), optional: useQR: optional argument; switches to QR-decomposition if set to .true. 
+ + logical, intent(in), optional: useGPU: decide whether the GPU version should be used or not + + character(*), optional method: use 1stage solver if "1stage", use 2stage solver if "2stage", (at the moment) use 2stage solver if "auto" + + logical success: return value indicating success or failure + + + C INTERFACE + + #include "elpa.h" + + success = elpa_solve_evp_real_double (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_ELPA_REAL_KERNEL, int useQR, int useGPU, char *method); + + + With the definitions of the input and output variables: + + + int na: global dimension of quadratic matrix a to solve + + int nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated + + double *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols + + int lda: leading dimension of locally distributed matrix a + + double *ev: pointer to memory containing on output the first nev computed eigenvalues + + double *q: pointer to memory containing on output the first nev computed eigenvectors + + int ldq: leading dimension of matrix q which stores the eigenvectors + + int nblk: blocksize of block cyclic distribution, must be the same in both directions + + int matrixCols: number of columns of locally distributed matrices a and q + + int mpi_comm_rows: communicator for communication in rows. Constructed with elpa_get_communicators + + int mpi_comm_cols: communicator for communication in columns. 
Constructed with elpa_get_communicators + + int mpi_comm_all: communicator for all processes in the processor set involved in ELPA + + int THIS_ELPA_REAL_KERNEL: choose the compute kernel for 2-stage solver + + int useQR: if set to 1 switch to QR-decomposition + + int useGPU: decide whether the GPU version should be used or not + + char *method: use 1stage solver if "1stage", use 2stage solver if "2stage", (at the moment) use 2stage solver if "auto" + + int success: return value indicating success (1) or failure (0) + + DESCRIPTION + Solve the real eigenvalue problem. The value of method decides whether the 1stage or 2stage solver is used. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the elpa_get_communicators function. The distributed quadratic matrix a has global dimensions na x na, and a local size lda x matrixCols. The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + +##### Setting up the GPU version of *ELPA* 1 and 2 stage ##### + +Since release ELPA 2016.011.001.pre *ELPA* offers GPU support, IF *ELPA* has been built with the configure option "--enable-gpu-support". + +At run-time the GPU version can be used by setting the environment variable "ELPA_USE_GPU" to "yes", or by calling the *ELPA* functions +(elpa_solve_evp_real_{double|single}, elpa_solve_evp_real_1stage_{double|single}, elpa_solve_evp_real_2stage_{double|single}) with the +argument "useGPU = .true." or "useGPU = 1" for the Fortran and C case, respectively. Please note that similar to the choice of the +*ELPA* 2stage compute kernels, the environment variable takes precedence over the setting in the API call. 
+ +Further note that it is NOT allowed to define the usage of GPUs AND to EXPLICITLY set an ELPA 2stage compute kernel other than +"REAL_ELPA_KERNEL_GPU" or "COMPLEX_ELPA_KERNEL_GPU". + + + diff -Nru elpa-2016.05.001/USERS_GUIDE.md elpa-2019.11.001/USERS_GUIDE.md --- elpa-2016.05.001/USERS_GUIDE.md 2016-05-20 07:40:52.000000000 +0000 +++ elpa-2019.11.001/USERS_GUIDE.md 2019-12-20 05:57:47.000000000 +0000 @@ -1,365 +1,503 @@ -## Users guide for the ELPA library ## +## Users guide for the *ELPA* library ## -This document provides the guide for using the *ELPA* library in user applications. +This document provides the guide for using the *ELPA* library with the new API (API version 20170403 or higher). +Please do understand that this release deprecated the old, legacy interface API! +If you want to use the deprecated legacy API (we strongly recommend against this), please use the ELPA release +2019.05.002 or older. + +If you need instructions on how to build *ELPA*, please look at [INSTALL.md] (INSTALL.md). ### Online and local documentation ### Local documentation (via man pages) should be available (if *ELPA* has been installed with the documentation): -For example "man get_elpa_communicators" should provide the documentation for the *ELPA* function which sets -the necessary communicators. +For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all +the available kernels. -Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2016.05.001/html/index.html) +Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.11.001/html/index.html) for each *ELPA* release is available. 
-### General concept of the *ELPA* library ### -The *ELPA* library consists of two main parts: -- *ELPA 1stage* solver -- *ELPA 2stage* solver +### API of the *ELPA* library ### + +With release 2017.05.001 of the *ELPA* library the interface has been rewritten substantially, in order to have a more generic +interface and to avoid future interface changes. + +The release ELPA 2018.11.001 was the last release, where the legacy API has been +enabled by default (and can be disabled at build time). +With release ELPA 2019.05.001 the legacy API is disabled by default, however, +can be still switched on at build time. +With the release ELPA 2019.11.001 the legacy API will has been deprecated and has been removed. + + +### Table of Contents: ### + +- I) General concept of the *ELPA* API +- II) List of supported tunable parameters +- III) List of computational routines +- IV) Using OpenMP threading +- V) Influencing default values with environment variables +- VI) Autotuning +- VII) A simple example how to use ELPA in an MPI application + +## I) General concept of the *ELPA* API ## + +Using *ELPA* just requires a few steps: + +- include elpa headers "elpa/elpa.h" (C-Case) or use the Fortran module "use elpa" + +- define a instance of the elpa type + +- call elpa_init + +- call elpa_allocate to allocate an instance of *ELPA* + note that you can define (and configure individually) as many different instances + for ELPA as you want, e.g. 
one for CPU only computations and for larger matrices on GPUs + +- use ELPA-type function "set" to set matrix and MPI parameters + +- call the ELPA-type function "setup" + +- set or get all possible ELPA tunable options with ELPA-type functions get/set + +- call ELPA-type function solve or others + +- if the ELPA object is not needed any more call ELPA-type function destroy + +- call elpa_uninit at the end of the program + +To be more precise a basic call sequence for Fortran and C looks as follows: + +Fortran synopsis + +```Fortran + use elpa + class(elpa_t), pointer :: elpa + integer :: success + + if (elpa_init(20171201) /= ELPA_OK) then ! put here the API version that you are using + print *, "ELPA API version not supported" + stop + endif + elpa => elpa_allocate(success) + if (success != ELPA_OK) then + ! react on the error + ! we urge every user to always check the error codes + ! of all ELPA functions + endif + + ! set parameters decribing the matrix and it's MPI distribution + call elpa%set("na", na, success) ! size of the na x na matrix + call elpa%set("nev", nev, success) ! number of eigenvectors that should be computed ( 1<= nev <= na) + call elpa%set("local_nrows", na_rows, success) ! number of local rows of the distributed matrix on this MPI task + call elpa%set("local_ncols", na_cols, success) ! number of local columns of the distributed matrix on this MPI task + call elpa%set("nblk", nblk, success) ! size of the BLACS block cyclic distribution + call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success) ! the global MPI communicator + call elpa%set("process_row", my_prow, success) ! row coordinate of MPI process + call elpa%set("process_col", my_pcol, success) ! column coordinate of MPI process + + success = elpa%setup() + + ! if desired, set any number of tunable run-time options + ! look at the list of possible options as detailed later in + ! USERS_GUIDE.md + call e%set("solver", ELPA_SOLVER_2STAGE, success) + + ! 
set the AVX BLOCK2 kernel, otherwise ELPA_2STAGE_REAL_DEFAULT will + ! be used + call e%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, success) + + ! use method solve to solve the eigenvalue problem to obtain eigenvalues + ! and eigenvectors + ! other possible methods are desribed in USERS_GUIDE.md + call e%eigenvectors(a, ev, z, success) + + ! cleanup + call elpa_deallocate(e) + + call elpa_uninit() +``` + +C Synopsis: +```C + #include + + elpa_t handle; + int error; + + if (elpa_init(20171201) != ELPA_OK) { // put here the API version that you are using + fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } + + handle = elpa_allocate(&error); + if (error != ELPA_OK) { + /* react on the error code */ + /* we urge the user to always check the error codes of all ELPA functions */ + } + + + /* Set parameters the matrix and it's MPI distribution */ + elpa_set(handle, "na", na, &error); // size of the na x na matrix + elpa_set(handle, "nev", nev, &error); // number of eigenvectors that should be computed ( 1<= nev <= na) + elpa_set(handle, "local_nrows", na_rows, &error); // number of local rows of the distributed matrix on this MPI task + elpa_set(handle, "local_ncols", na_cols, &error); // number of local columns of the distributed matrix on this MPI task + elpa_set(handle, "nblk", nblk, &error); // size of the BLACS block cyclic distribution + elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(MPI_COMM_WORLD), &error); // the global MPI communicator + elpa_set(handle, "process_row", my_prow, &error); // row coordinate of MPI process + elpa_set(handle, "process_col", my_pcol, &error); // column coordinate of MPI process + + /* Setup */ + error = elpa_setup(handle); + + /* if desired, set any number of tunable run-time options */ + /* look at the list of possible options as detailed later in + USERS_GUIDE.md */ + + elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error); + + // set the AVX BLOCK2 kernel, otherwise ELPA_2STAGE_REAL_DEFAULT will + // 
be used + elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error) + + /* use method solve to solve the eigenvalue problem */ + /* other possible methods are desribed in USERS_GUIDE.md */ + elpa_eigenvectors(handle, a, ev, z, &error); + + /* cleanup */ + elpa_deallocate(handle); + elpa_uninit(); +``` + +## II) List of supported tunable parameters ## -Both variants of the *ELPA* solvers are available for real or complex valued matrices. +The following table gives a list of all supported parameters which can be used to tune (influence) the runtime behaviour of *ELPA* ([see here if you cannot read it in your editor] (https://gitlab.mpcdf.mpg.de/elpa/elpa/wikis/USERS_GUIDE)) -Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2016.05.001/html/index.html) for details): +| Parameter name | Short description | default value | possible values | since API version | +| :------------- |:--------------------- | :-------------------------- | :---------------------- | :---------------- | +| solver | use ELPA 1 stage
or 2 stage solver | ELPA_SOLVER_1STAGE | ELPA_SOLVER_1STAGE
ELPA_SOLVER_2STAGE | 20170403 | +| gpu | use GPU (if build
with GPU support)| 0 | 0 or 1 | 20170403 | +| real_kernel | real kernel to be
used in ELPA 2 | ELPA_2STAGE_REAL_DEFAULT | see output of
elpa2_print_kernels | 20170403 | +| complex kernel | complex kernel to
be used in ELPA 2 | ELPA_2STAGE_COMPLEX_DEFAULT | see output of
elpa2_print_kernels | 20170403 | +| omp_threads | OpenMP threads used
(if build with OpenMP
support) | 1 | >1 | 20180525 | +| qr | Use QR decomposition in
ELPA 2 real | 0 | 0 or 1 | 20170403 | +| timings | Enable time
measurement | 1 | 0 or 1 | 20170403 | +| debug | give debug information | 0 | 0 or 1 | 20170403 | + -- get_elpa_communicators : set the row / column communicators for *ELPA* -- solve_evp_complex_1stage : solve a complex valued eigenvale proplem with the *ELPA 1stage* solver -- solve_evp_real_1stage : solve a real valued eigenvale proplem with the *ELPA 1stage* solver -- solve_evp_complex_2stage : solve a complex valued eigenvale proplem with the *ELPA 2stage* solver -- solve_evp_real_2stage : solve a real valued eigenvale proplem with the *ELPA 2stage* solver +## III) List of computational routines ## -Furthermore *ELPA* provides the utility binary "print_available_elpa2_kernels": it tells the user -which *ELPA 2stage* compute kernels have been installed and which default kernels are set +The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.11.001/html/index.html) for details. -If you want to solve an eigenvalue problem with *ELPA*, you have to decide whether you -want to use *ELPA 1stage* or *ELPA 2stage* solver. Normally, *ELPA 2stage* is the better -choice since it is faster, but there a matrix dimensions where *ELPA 1stage* is supperior. -Independent of the choice of the solver, the concept of calling *ELPA* is always the same: +| Name | Purpose | since API version | +| :----------- | :---------------------------------------------------------------------- | :---------------- | +| eigenvectors | solve std. eigenvalue problem
compute eigenvalues and eigenvectors | 20170403 | +| eigenvalues | solve std. eigenvalue problem
compute eigenvalues only | 20170403 | +| generalized_eigenvectors | solve generalized eigenvalule problem
compute eigenvalues and eigenvectors | 20180525 | +| generalized_eigenvalues | solve generalized eigenvalule problem
compute eigenvalues only | 20180525 | +| hermitian_multiply | do (real) a^T x b
(complex) a^H x b | 20170403 | +| cholesky | do cholesky factorisation | 20170403 | +| invert_triangular | invert a upper triangular matrix | 20170403 | +| solve_tridiagonal | solve EVP for a tridiagonal matrix | 20170403 | -#### MPI version of *ELPA* #### -In this case, *ELPA* relies on a BLACS distributed matrix. -To solve a Eigenvalue problem of this matrix with *ELPA*, one has +## IV) Using OpenMP threading ## -1. to include the *ELPA* header (C case) or module (Fortran) -2. to create row and column MPI communicators for ELPA (with "get_elpa_communicators") -3. to call *ELPA 1stage* or *ELPA 2stage* for the matrix. +If *ELPA* has been build with OpenMP threading support you can specify the number of OpenMP threads that *ELPA* will use internally. +Please note that it is **mandatory** to set the number of threads to be used with the OMP_NUM_THREADS environment variable **and** +with the **set method** -Here is a very simple MPI code snippet for using *ELPA 1stage*: For the definition of all variables -please have a look at the man pages and/or the online documentation (see above). A full version -of a simple example program can be found in ./test_project/src. +```Fortran +call e%set("omp_threads", 4, error) +``` +**or the *ELPA* environment variable** - ! All ELPA routines need MPI communicators for communicating within - ! rows or columns of processes, these are set in get_elpa_communicators +export ELPA_DEFAULT_omp_threads=4 (see Section V for an explanation of this variable). - success = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & - mpi_comm_rows, mpi_comm_cols) +Just setting the environment variable OMP_NUM_THREADS is **not** sufficient. - if (myid==0) then - print '(a)','| Past split communicator setup for rows and columns.' - end if +This is necessary to make the threading an autotunable option. - ! Determine the necessary size of the distributed matrices, - ! we use the Scalapack tools routine NUMROC for that. 
+## V) Influencing default values with environment variables ## - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) +For each tunable parameter mentioned in Section II, there exists a default value. This means, that if this parameter is **not explicitly** set by the user by the +*ELPA* set method, *ELPA* takes the default value for the parameter. E.g. if the user does not set a solver method, than *ELPA* will take the default "ELPA_SOLVER_1STAGE". - !------------------------------------------------------------------------------- - ! Calculate eigenvalues/eigenvectors +The user can change this default value by setting an enviroment variable to the desired value. - if (myid==0) then - print '(a)','| Entering one-step ELPA solver ... ' - print * - end if +The name of this variable is always constructed in the following way: +``` +ELPA_DEFAULT_tunable_parameter_name=value +``` - success = solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols) +, e.g. in case of the solver the user can - if (myid==0) then - print '(a)','| One-step ELPA solver complete.' - print * - end if +``` +export ELPA_DEFAULT_solver=ELPA_SOLVER_2STAGE +``` +in order to define the 2stage solver as the default. -#### Shared-memory version of *ELPA* #### +**Important note** +The default valule is completly ignored, if the user has manually set a parameter-value pair with the *ELPA* set method! +Thus the above environemnt variable will **not** have an effect, if the user code contains a line +```Fortran +call e%set("solver",ELPA_SOLVER_1STAGE,error) +``` +. -If the *ELPA* library has been compiled with the configure option "--with-mpi=0", -no MPI will be used. +## VI) Using autotuning ## -Still the **same** call sequence as in the MPI case can be used (see above). +Since API version 20171201 *ELPA* supports the autotuning of some "tunable" parameters (see Section II). 
The idea is that if *ELPA* is called multiple times (like typical in +self-consistent-iterations) some parameters can be tuned to an optimal value, which is hard to set for the user. Note, that not every parameter mentioned in Section II can actually be tuned with the autotuning. At the moment, only the parameters mentioned in the table below are affected by autotuning. -#### Setting the row and column communicators #### +There are two ways, how the user can influence the autotuning steps: -SYNOPSIS - FORTRAN INTERFACE - use elpa1 +1.) the user can set one of the following autotuning levels +- ELPA_AUTOTUNE_FAST +- ELPA_AUTOTUNE_MEDIUM - success = get_elpa_communicators (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) +Each level defines a different set of tunable parameter. The autouning option will be extended by future releases of the *ELPA* library, at the moment the following +sets are supported: - integer, intent(in) mpi_comm_global: global communicator for the calculation - integer, intent(in) my_prow: row coordinate of the calling process in the process grid - integer, intent(in) my_pcol: column coordinate of the calling process in the process grid - integer, intent(out) mpi_comm_row: communicator for communication within rows of processes - integer, intent(out) mpi_comm_row: communicator for communication within columns of processes +| AUTOTUNE LEVEL | Parameters | +| :---------------------- | :------------------------------------------------------ | +| ELPA_AUTOTUNE_FAST | { solver, real_kernel, complex_kernel, omp_threads } | +| ELPA_AUTOTUNE_MEDIUM | all of abvoe + { gpu, partly gpu } | +| ELPA_AUTOTUNE_EXTENSIVE | all of above + { various blocking factors, stripewidth, | +| | intermediate_bandwidth } | - integer success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function +2.) the user can **remove** tunable parameters from the list of autotuning possibilites by explicetly setting this parameter, +e.g. 
if the user sets in his code - C INTERFACE - #include "elpa_generated.h" +```Fortran +call e%set("solver", ELPA_SOLVER_2STAGE, error) +``` +**before** invoking the autotuning, then the solver is fixed and not considered anymore for autotuning. Thus the ELPA_SOLVER_1STAGE would be skipped and, consequently, all possible autotuning parameters, which depend on ELPA_SOLVER_1STAGE. - success = get_elpa_communicators (int mpi_comm_world, int my_prow, my_pcol, int *mpi_comm_rows, int *Pmpi_comm_cols); +The user can invoke autotuning in the following way: - int mpi_comm_global: global communicator for the calculation - int my_prow: row coordinate of the calling process in the process grid - int my_pcol: column coordinate of the calling process in the process grid - int *mpi_comm_row: pointer to the communicator for communication within rows of processes - int *mpi_comm_row: pointer to the communicator for communication within columns of processes - int success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function +Fortran synopsis +```Fortran + ! prepare elpa as you are used to (see Section I) + ! only steps for autotuning are commentd + use elpa + class(elpa_t), pointer :: elpa + class(elpa_autotune_t), pointer :: tune_state ! create an autotuning pointer + integer :: success -#### Using *ELPA 1stage* #### + if (elpa_init(20171201) /= ELPA_OK) then + print *, "ELPA API version not supported" + stop + endif + elpa => elpa_allocate(success) -After setting up the *ELPA* row and column communicators (by calling get_elpa_communicators), -only the real or complex valued solver has to be called: + ! 
set parameters decribing the matrix and it's MPI distribution + call elpa%set("na", na, success) + call elpa%set("nev", nev, success)) + call elpa%set("local_nrows", na_rows, success) + call elpa%set("local_ncols", na_cols, success) + call elpa%set("nblk", nblk, success) + call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success) + call elpa%set("process_row", my_prow, success) + call elpa%set("process_col", my_pcol, success) -SYNOPSIS - FORTRAN INTERFACE - use elpa1 - success = solve_evp_real_1stage (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, - mpi_comm_cols) + success = elpa%setup() - With the definintions of the input and output variables: + tune_state => e%autotune_setup(ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL, success) ! prepare autotuning, set AUTOTUNE_LEVEL and the domain (real or complex) - integer, intent(in) na: global dimension of quadratic matrix a to solve - integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - real*8, intent(inout) a: locally distributed part of the matrix a. The local dimensions are lda x matrixCols - integer, intent(in) lda: leading dimension of locally distributed matrix a - real*8, intent(inout) ev: on output the first nev computed eigenvalues - real*8, intent(inout) q: on output the first nev computed eigenvectors - integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors - integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions - integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q - integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - integer, intent(in) mpi_comm_cols: communicator for communication in colums. Constructed with get_elpa_communicators(3) + ! 
do the loop of subsequent ELPA calls which will be used to do the autotuning + do i=1, scf_cycles + unfinished = e%autotune_step(tune_state, success) ! check whether autotuning is finished; If not do next step - logical success: return value indicating success or failure + if (.not.(unfinished)) then + print *,"autotuning finished at step ",i + endif - C INTERFACE - #include "elpa.h" + call e%eigenvectors(a, ev, z, success) ! do the normal computation - success = solve_evp_real_1stage (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int - mpi_comm_rows, int mpi_comm_cols); + enddo - With the definintions of the input and output variables: + call e%autotune_set_best(tune_state, success) ! from now use the values found by autotuning - int na: global dimension of quadratic matrix a to solve - int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - double *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols - int lda: leading dimension of locally distributed matrix a - double *ev: pointer to memory containing on output the first nev computed eigenvalues - double *q: pointer to memory containing on output the first nev computed eigenvectors - int ldq: leading dimension of matrix q which stores the eigenvectors - int nblk: blocksize of block cyclic distributin, must be the same in both directions - int matrixCols: number of columns of locally distributed matrices a and q - int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - int mpi_comm_cols: communicator for communication in colums. Constructed with get_elpa_communicators(3) + call elpa_autotune_deallocate(tune_state) ! 
cleanup autotuning object +``` - int success: return value indicating success (1) or failure (0) +C Synopsis +```C + /* prepare ELPA the usual way; only steps for autotuning are commented */ + #include -DESCRIPTION - Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the - get_elpa_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. - The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues - will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + elpa_t handle; + elpa_autotune_t autotune_handle; // handle for autotuning + int error; - FORTRAN INTERFACE - use elpa1 - success = solve_evp_complex_1stage (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, - mpi_comm_cols) + if (elpa_init(20171201) != ELPA_OK) { + fprintf(stderr, "Error: ELPA API version not supported"); + exit(1); + } - With the definintions of the input and output variables: + handle = elpa_allocate(&error); - integer, intent(in) na: global dimension of quadratic matrix a to solve - integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - complex*16, intent(inout) a: locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols - integer, intent(in) lda: leading dimension of locally distributed matrix a - real*8, intent(inout) ev: on output the first nev computed eigenvalues - complex*16, intent(inout) q: on output the first nev computed eigenvectors - integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors - integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions - integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q - integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - integer, intent(in) mpi_comm_cols: communicator for communication in colums. Constructed with get_elpa_communicators(3) + /* Set parameters the matrix and it's MPI distribution */ + elpa_set(handle, "na", na, &error); + elpa_set(handle, "nev", nev, &error); + elpa_set(handle, "local_nrows", na_rows, &error); + elpa_set(handle, "local_ncols", na_cols, &error); + elpa_set(handle, "nblk", nblk, &error); + elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(MPI_COMM_WORLD), &error); + elpa_set(handle, "process_row", my_prow, &error); + elpa_set(handle, "process_col", my_pcol, &error); + /* Setup */ + elpa_setup(handle); - logical success: return value indicating success or failure + autotune_handle = elpa_autotune_setup(handle, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, &error); // create autotune object - C INTERFACE - #include "elpa.h" - #include + // repeatedl call ELPA, e.g. in an scf iteration + for (i=0; i < scf_cycles; i++) { - success = solve_evp_complex_1stage (int na, int nev, double complex *a, int lda, double *ev, double complex*q, int ldq, int nblk, int - matrixCols, int mpi_comm_rows, int mpi_comm_cols); + unfinished = elpa_autotune_step(handle, autotune_handle, &error); // check whether autotuning finished. 
If not do next step - With the definintions of the input and output variables: + if (unfinished == 0) { + printf("ELPA autotuning finished in the %d th scf step \n",i); + } - int na: global dimension of quadratic matrix a to solve - int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - double complex *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols - int lda: leading dimension of locally distributed matrix a - double *ev: pointer to memory containing on output the first nev computed eigenvalues - double complex *q: pointer to memory containing on output the first nev computed eigenvectors - int ldq: leading dimension of matrix q which stores the eigenvectors - int nblk: blocksize of block cyclic distributin, must be the same in both directions - int matrixCols: number of columns of locally distributed matrices a and q - int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - int mpi_comm_cols: communicator for communication in colums. Constructed with get_elpa_communicators(3) - int success: return value indicating success (1) or failure (0) + /* do the normal computation */ + elpa_eigenvectors(handle, a, ev, z, &error); + } + elpa_autotune_set_best(handle, autotune_handle &error); // from now on use values used by autotuning + elpa_autotune_deallocate(autotune_handle); // cleanup autotuning +``` -DESCRIPTION - Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the - get_elpa_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. - The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues - will be stored in q. All memory of the arguments must be allocated outside the call to the solver. 
+## VII) A simple example how to use ELPA in an MPI application ## -The *ELPA 1stage* solver, does not need or accept any other parameters than in the above -specification. +The following is a skeleton code of an basic example on how to use ELPA. The purpose is to show the steps that have +to be done in the application using MPI which wants to call ELPA, namely -#### Using *ELPA 2stage* #### +- Initializing the MPI +- creating a blacs distributed matrics +- using this matrix within ELPA -The *ELPA 2stage* solver can be used in the same manner, as the *ELPA 1stage* solver. -However, the 2 stage solver, can be used with different compute kernels, which offers -more possibilities for configuration. +The skeleton is not ment to be copied and pasted, since the details will always be dependent on the application which should +call ELPA. -It is recommended to first call the utillity program +For simplicity only a Fortran example is shown -elpa2_print_kernels -which will tell all the compute kernels that can be used with *ELPA 2stage*". It will -also give information, whether a kernel can be set via environment variables. +```Fortran -##### Using the default kernels ##### +use mpi -If no kernel is set either via an environment variable or the *ELPA 2stage API* then -the default kernels will be set. +implicit none -##### Setting the *ELPA 2stage* compute kernels ##### +integer :: mpierr, myid, nprocs +integer :: np_cols, np_rows, npcol, nprow +integer :: my_blacs_ctxt, sc_desc(9), info +integer :: na = [some value] ! global dimension of the matrix to be solved +integer :: nblk = [some value ] ! the block size of the scalapack block cyclic distribution +real*8, allocatable :: a(:,:), ev(:) -If the *ELPA* installation allows setting ther compute kernels with enviroment variables, -setting the variables "REAL_ELPA_KERNEL" and "COMPLEX_ELPA_KERNEL" will set the compute -kernels. The environment variable setting will take precedence over all other settings! 
+!------------------------------------------------------------------------------- +! MPI Initialization -It is also possible to set the *ELPA 2stage* compute kernels via the API. +call mpi_init(mpierr) +call mpi_comm_rank(mpi_comm_world,myid,mpierr) +call mpi_comm_size(mpi_comm_world,nprocs,mpierr) -SYNOPSIS - FORTRAN INTERFACE - use elpa1 use elpa2 - success = solve_evp_real_2stage (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, - mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR) +!------------------------------------------------------------------------------- +! Selection of number of processor rows/columns +! the application has to decide how the matrix should be distributed +np_cols = [ some value ] +np_rows = [ some value ] - With the definintions of the input and output variables: - integer, intent(in) na: global dimension of quadratic matrix a to solve - integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - real*8, intent(inout) a: locally distributed part of the matrix a. The local dimensions are lda x matrixCols - integer, intent(in) lda: leading dimension of locally distributed matrix a - real*8, intent(inout) ev: on output the first nev computed eigenvalues - real*8, intent(inout) q: on output the first nev computed eigenvectors - integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors - integer, intent(in) nblk: blocksize of block cyclic distributin, must be the same in both directions - integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q - integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - integer, intent(in) mpi_comm_cols: communicator for communication in colums. 
Constructed with get_elpa_communicators(3) - integer, intent(in) mpi_comm_all: communicator for all processes in the processor set involved in ELPA - logical, intent(in), optional: useQR: optional argument; switches to QR-decomposition if set to .true. +!------------------------------------------------------------------------------- +! Set up BLACS context and MPI communicators +! +! The BLACS context is only necessary for using Scalapack. +! +! For ELPA, the MPI communicators along rows/cols are sufficient, +! and the grid setup may be done in an arbitrary way as long as it is +! consistent (i.e. 0<=my_prow elpa_allocate(success) + if (success != ELPA_OK) then + ! react on the error + ! we urge every user to always check the error codes + ! of all ELPA functions + endif - C INTERFACE - #include "elpa.h" - #include + ! set parameters decribing the matrix and it's MPI distribution + call elpa%set("na", na, success) ! size of the na x na matrix + call elpa%set("nev", nev, success) ! number of eigenvectors that should be computed ( 1<= nev <= na) + call elpa%set("local_nrows", na_rows, success) ! number of local rows of the distributed matrix on this MPI task + call elpa%set("local_ncols", na_cols, success) ! number of local columns of the distributed matrix on this MPI task + call elpa%set("nblk", nblk, success) ! size of the BLACS block cyclic distribution + call elpa%set("mpi_comm_parent", MPI_COMM_WORLD, success) ! the global MPI communicator + call elpa%set("process_row", my_prow, success) ! row coordinate of MPI process + call elpa%set("process_col", my_pcol, success) ! column coordinate of MPI process - success = solve_evp_complex_2stage (int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int - matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_ELPA_REAL_KERNEL); + success = elpa%setup() - With the definintions of the input and output variables: + ! 
if desired, set any number of tunable run-time options + ! look at the list of possible options as detailed later in + ! USERS_GUIDE.md + call e%set("solver", ELPA_SOLVER_2STAGE, success) - int na: global dimension of quadratic matrix a to solve - int nev: number of eigenvalues to be computed; the first nev eigenvalules are calculated - double complex *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols - int lda: leading dimension of locally distributed matrix a - double *ev: pointer to memory containing on output the first nev computed eigenvalues - double complex *q: pointer to memory containing on output the first nev computed eigenvectors - int ldq: leading dimension of matrix q which stores the eigenvectors - int nblk: blocksize of block cyclic distributin, must be the same in both directions - int matrixCols: number of columns of locally distributed matrices a and q - int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) - int mpi_comm_cols: communicator for communication in colums. Constructed with get_elpa_communicators(3) - int mpi_comm_all: communicator for all processes in the processor set involved in ELPA - int success: return value indicating success (1) or failure (0) + ! set the AVX BLOCK2 kernel, otherwise ELPA_2STAGE_REAL_DEFAULT will + ! be used + call e%set("real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, success) -DESCRIPTION - Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the - get_elpa_communicators(3) function. The distributed quadratic marix a has global dimensions na x na, and a local size lda x matrixCols. - The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues - will be stored in q. All memory of the arguments must be allocated outside the call to the solver. + ! 
use method solve to solve the eigenvalue problem to obtain eigenvalues + ! and eigenvectors + ! other possible methods are desribed in USERS_GUIDE.md + call e%eigenvectors(a, ev, z, success) + ! cleanup + call elpa_deallocate(e) + call elpa_uninit() +```